About GPU Memory
Thanks for the awesome work!
According to the reported runtime-and-gpu-memory, VGGT only consumes 1.88 GB for a single input frame.
However, when I tried it myself following the official guideline, the released 1B model consumes 7-8 GB of memory for a single input frame on an NVIDIA A100 GPU. How can I reproduce the results reported in runtime-and-gpu-memory?
Thanks in advance!
Hi @GuangyuWang99,
Could you try the script below?
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
print("Initializing and loading VGGT model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+)
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
model.eval()
# Load and preprocess example images (replace with your own image paths)
image_names = ["examples/kitchen/images/00.png", "examples/kitchen/images/01.png", "examples/kitchen/images/02.png", "examples/kitchen/images/03.png"]
images = load_and_preprocess_images(image_names).to(device)
torch.cuda.synchronize()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
memory_before = torch.cuda.memory_allocated() / (1024 * 1024 * 1024) # GB
with torch.no_grad():
    with torch.cuda.amp.autocast(dtype=dtype):
        images = images[None]  # add batch dimension
        aggregated_tokens_list, ps_idx = model.aggregator(images)

        # Uncomment the lines below if you also want to include these predictions:
        # Predict cameras
        # pose_enc = model.camera_head(aggregated_tokens_list)[-1]
        # Predict depth maps
        # depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx, frames_chunk_size=1)

peak_memory_gb = torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024)
memory_used_gb = peak_memory_gb - memory_before
print(f"Memory used: {memory_used_gb:.4f} GB")
@jytime I am facing a similar issue: a gap between the model's reported memory demands and what I observe in practice. This is my code:
import pickle
import os
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
# read pickle
with open("volleyball_references.pkl", "rb") as f:
    references = pickle.load(f)
print("Initializing and loading VGGT model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+)
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
model.eval()
# Load and preprocess example images (replace with your own image paths)
# image_names = ["examples/kitchen/images/00.png", "examples/kitchen/images/01.png", "examples/kitchen/images/02.png", "examples/kitchen/images/03.png"]
images, _ = load_and_preprocess_images(references, mode="pad")
images = images.to(device)[:100]
torch.cuda.synchronize()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
memory_before = torch.cuda.memory_allocated() / (1024 * 1024 * 1024) # GB
with torch.no_grad():
    with torch.cuda.amp.autocast(dtype=dtype):
        images = images[None]  # add batch dimension
        aggregated_tokens_list, ps_idx = model.aggregator(images)

        # Uncomment the lines below if you also want to include these predictions:
        # Predict cameras
        # pose_enc = model.camera_head(aggregated_tokens_list)[-1]
        # Predict depth maps
        # depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx, frames_chunk_size=1)

peak_memory_gb = torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024)
memory_used_gb = peak_memory_gb - memory_before
print(f"Memory used: {memory_used_gb:.4f} GB")
I am using an A100 with 82 GB, and running on 100 images takes up 31 GB. I am hoping to run around 300 images, so it simply runs out of memory.
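To get a rough sense of how the memory scales before committing to 300 frames, here is a minimal sketch (assuming the same model and the preprocessed images tensor from the script above, before the batch dimension is added) that measures the aggregator's peak memory for a few frame counts; the helper name aggregator_peak_gb is just for this sketch:

import torch

def aggregator_peak_gb(model, images, n, dtype=torch.bfloat16):
    """Peak GPU memory (GB) when running the aggregator on the first n frames."""
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        with torch.cuda.amp.autocast(dtype=dtype):
            model.aggregator(images[:n][None])  # add batch dimension
    torch.cuda.synchronize()
    return torch.cuda.max_memory_allocated() / (1024 ** 3)

for n in (10, 25, 50, 100):
    print(f"{n} frames -> peak {aggregator_peak_gb(model, images, n):.1f} GB")

The point is only to see how close the trend gets to the 80 GB limit as the frame count grows.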
@jytime Thanks for your reply. The above script does not solve my issue; the 1B model still consumes 7-8 GB of memory for a single input image.
I think the numbers in the report refer to additional GPU memory. Let me explain: the model weights take up roughly 5 GB, so I believe they compute used_memory - 5 GB, i.e., how much memory the images require on top of the model itself. Otherwise I can't see how a ~5 GB model could be squeezed into 1.88 GB.
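One way to sanity-check this interpretation is to record the memory held right after loading the weights and compare it against the peak reached during inference. A minimal sketch along the lines of the scripts above, using a single frame from the kitchen example:

import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

device = "cuda"
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16

model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
model.eval()

torch.cuda.synchronize()
weights_gb = torch.cuda.memory_allocated() / (1024 ** 3)  # memory held by the weights alone
print(f"Model weights: {weights_gb:.2f} GB")

# Single input frame, as in the original question
images = load_and_preprocess_images(["examples/kitchen/images/00.png"]).to(device)

torch.cuda.reset_peak_memory_stats()
with torch.no_grad():
    with torch.cuda.amp.autocast(dtype=dtype):
        aggregated_tokens_list, ps_idx = model.aggregator(images[None])  # add batch dimension
torch.cuda.synchronize()

peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3)
print(f"Peak during inference: {peak_gb:.2f} GB")
print(f"Additional over weights: {peak_gb - weights_gb:.2f} GB")

If the "additional over weights" number comes out near 1.88 GB while the total peak is in the 7-8 GB range, that would suggest the table reports inference memory on top of the weights rather than the full footprint.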