What is the frame sampling strategy of VGGT during training?
Hi, thanks for this promising work on 3D reconstruction.
Is there any trick for sampling diverse sequences from a monocular video during VGGT training? For example, ensuring the overlap between two video frames falls within a certain range, rather than being completely non-overlapping or having almost no camera movement.
I have the same concerns🤔
How do you sample the correct views during the training stage, especially across different datasets? Do you employ a pair selection strategy similar to DUST3R or Fast3R? If so, how do you handle view sampling in other datasets, such as the Aria Digital Twin? If not, could you kindly share your approach—or even the relevant code, if possible?
Thanks!
Hi @guangkaixu @jianglh-WHU ,
We didn’t follow the DUST3R strategy. Instead, we compute camera-pose-based similarity between frames using the code below. For each frame, we rank all others by pose similarity and select the top N nearest ones as its valid “range” (N = 512 or 256). Then, for each sequence, we randomly pick one frame as the first frame (call it the query, anchor, or whatever name you prefer) and sample the rest from its valid range.
We’ve visually checked many datasets and this method is stable. It also works very well during training. We chose it because it’s extremely fast and avoids co-visibility concerns altogether, which saves pain in data processing.
```python
import os
import os.path as osp
import numpy as np
import glob
import json
import pdb
import tqdm


def rotation_angle(R1, R2):
    # R1 and R2 are 3x3 rotation matrices
    R = R1.T @ R2
    # Numerical stability: clamp values into [-1, 1]
    val = (np.trace(R) - 1) / 2
    val = np.clip(val, -1.0, 1.0)
    angle_rad = np.arccos(val)
    angle_deg = np.degrees(angle_rad)  # Convert radians to degrees
    return angle_deg


def extrinsic_distance(extrinsic1, extrinsic2, lambda_t=1.0):
    R1, t1 = extrinsic1[:3, :3], extrinsic1[:3, 3]
    R2, t2 = extrinsic2[:3, :3], extrinsic2[:3, 3]
    rot_diff = rotation_angle(R1, R2) / 180
    center_diff = np.linalg.norm(t1 - t2)
    return rot_diff + lambda_t * center_diff


def rotation_angle_batch(R1, R2):
    # R1, R2: shape (N, 3, 3)
    # We want a matrix of rotation angles for all pairs.
    # We'll get R1^T R2 for each pair.
    # Expand dimensions to broadcast:
    #   R1^T: (N,3,3) -> (N,1,3,3)
    #   R2:   (N,3,3) -> (1,N,3,3)
    R1_t = np.transpose(R1, (0, 2, 1))[:, np.newaxis, :, :]  # shape (N,1,3,3)
    R2_b = R2[np.newaxis, :, :, :]  # shape (1,N,3,3)
    R_mult = np.matmul(R1_t, R2_b)  # shape (N,N,3,3)
    # trace(R) for each pair
    trace_vals = R_mult[..., 0, 0] + R_mult[..., 1, 1] + R_mult[..., 2, 2]  # (N,N)
    val = (trace_vals - 1) / 2
    val = np.clip(val, -1.0, 1.0)
    angle_rad = np.arccos(val)
    angle_deg = np.degrees(angle_rad)
    return angle_deg / 180.0  # normalized rotation difference


def extrinsic_distance_batch(extrinsics, lambda_t=1.0):
    # extrinsics: (N,4,4)
    # Extract rotation and translation
    R = extrinsics[:, :3, :3]  # (N,3,3)
    t = extrinsics[:, :3, 3]  # (N,3)
    # Compute all pairwise rotation differences
    rot_diff = rotation_angle_batch(R, R)  # (N,N)
    # Compute all pairwise translation differences
    # For t, shape (N,3). We want all pair differences: t[i] - t[j].
    #   t_i: (N,1,3), t_j: (1,N,3)
    t_i = t[:, np.newaxis, :]  # (N,1,3)
    t_j = t[np.newaxis, :, :]  # (1,N,3)
    trans_diff = np.linalg.norm(t_i - t_j, axis=2)  # (N,N)
    dists = rot_diff + lambda_t * trans_diff
    return dists


def rotation_angle_batch_chunked(R, chunk_size):
    N = R.shape[0]
    rot_diff = np.empty((N, N), dtype=np.float32)
    # Precompute R transpose once
    R_t = R.transpose(0, 2, 1)
    for i_start in range(0, N, chunk_size):
        i_end = min(N, i_start + chunk_size)
        # Sub-block of R_t
        R_i_t = R_t[i_start:i_end]  # (B,3,3)
        for j_start in range(0, N, chunk_size):
            j_end = min(N, j_start + chunk_size)
            R_j = R[j_start:j_end]  # (B,3,3)
            # Compute R_i_t @ R_j for the block
            #   R_i_t: (B,3,3)
            #   R_j:   (B,3,3), but we need pairwise products, so we expand dims
            # This can still be large. If even BxB is too big, choose smaller chunks.
            R_mult = R_i_t[:, np.newaxis, :, :] @ R_j[np.newaxis, :, :, :]  # (B,B,3,3)
            # Compute trace
            trace_vals = R_mult[..., 0, 0] + R_mult[..., 1, 1] + R_mult[..., 2, 2]
            val = (trace_vals - 1.0) / 2.0
            np.clip(val, -1.0, 1.0, out=val)
            angle_rad = np.arccos(val)
            angle_deg = np.degrees(angle_rad)
            block_rot_diff = angle_deg / 180.0
            rot_diff[i_start:i_end, j_start:j_end] = block_rot_diff.astype(np.float32)
    return rot_diff


def extrinsic_distance_batch_chunked(extrinsics, lambda_t=1.0, chunk_size=1000):
    R = extrinsics[:, :3, :3].astype(np.float32)
    t = extrinsics[:, :3, 3].astype(np.float32)
    N = R.shape[0]
    # Compute rotation differences in chunks
    rot_diff = rotation_angle_batch_chunked(R, chunk_size)
    # Compute translation differences in chunks
    dists = np.empty((N, N), dtype=np.float32)
    for i_start in range(0, N, chunk_size):
        i_end = min(N, i_start + chunk_size)
        t_i = t[i_start:i_end]  # (B,3)
        for j_start in range(0, N, chunk_size):
            j_end = min(N, j_start + chunk_size)
            t_j = t[j_start:j_end]  # (B,3)
            # broadcasting: (B,1,3) - (1,B,3) => (B,B,3)
            diff = t_i[:, None, :] - t_j[None, :, :]
            trans_diff = np.linalg.norm(diff, axis=2)  # (B,B)
            # Add rotation and translation
            dists[i_start:i_end, j_start:j_end] = (
                rot_diff[i_start:i_end, j_start:j_end] + lambda_t * trans_diff
            )
    return dists


def compute_ranking(extrinsics, lambda_t=1.0, normalize=True, batched=True):
    if normalize:
        extrinsics = np.copy(extrinsics)
        camera_center = np.copy(extrinsics[:, :3, 3])
        camera_center_scale = np.linalg.norm(camera_center, axis=1)
        avg_scale = np.mean(camera_center_scale)
        extrinsics[:, :3, 3] = extrinsics[:, :3, 3] / avg_scale
    if batched:
        if len(extrinsics) > 6000:
            dists = extrinsic_distance_batch_chunked(extrinsics, lambda_t=lambda_t)
        else:
            dists = extrinsic_distance_batch(extrinsics, lambda_t=lambda_t)
    else:
        N = extrinsics.shape[0]
        dists = np.zeros((N, N))
        for i in range(N):
            for j in range(N):
                dists[i, j] = extrinsic_distance(extrinsics[i], extrinsics[j], lambda_t=lambda_t)
    ranking = np.argsort(dists, axis=1)
    return ranking, dists
```
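As a rough illustration (not the actual training dataloader), the returned ranking could be used to sample a frame set along the lines described above. The function `sample_frames` and its parameters below are hypothetical names I'm using for the sketch:

```python
# Illustrative sketch only: how compute_ranking's output could drive frame sampling.
# The function name and parameters below are hypothetical, not from the VGGT codebase.
import numpy as np

def sample_frames(ranking, num_frames, top_n=256, rng=None):
    # ranking: (N, N) array from compute_ranking; row i lists frame indices
    #          sorted by pose distance to frame i (closest first, including i itself).
    # Assumes num_frames <= top_n.
    rng = np.random.default_rng() if rng is None else rng
    N = ranking.shape[0]
    query = rng.integers(N)                 # randomly pick the first (query/anchor) frame
    valid_range = ranking[query, :top_n]    # its top-N nearest frames by pose similarity
    candidates = valid_range[valid_range != query]
    others = rng.choice(candidates, size=num_frames - 1, replace=False)
    return np.concatenate([[query], others])

# Example usage (assuming extrinsics is an (N, 4, 4) array of camera extrinsics):
# ranking, _ = compute_ranking(extrinsics, lambda_t=1.0)
# frame_ids = sample_frames(ranking, num_frames=12, top_n=256)
```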
Hi @jytime , could you please explain more about how you manage to do this? In my understanding, within each batch there are special camera tokens and register tokens for each anchor frame; how do you compose them in the dataloader?
Hi @Livioni ,
Camera tokens and register tokens are learnable tokens; they are defined inside the network. They aggregate information from the input frames and do not need any special preparation in the data processing stage. You can check our implementation here:
https://github.com/facebookresearch/vggt/blob/b02cc03ceee70821ed1231a530c1992507ef9862/vggt/models/aggregator.py#L125
Check this paper for a better understanding:
https://arxiv.org/abs/2309.16588
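If it helps intuition, here is a generic, minimal sketch of the learnable-token idea. This is not the VGGT implementation (see the aggregator.py link above for the real code), and the names and dimensions are made up:

```python
# Generic sketch of learnable camera/register tokens, NOT the actual VGGT aggregator.
# Names and dimensions are illustrative only.
import torch
import torch.nn as nn

class TokenSketch(nn.Module):
    def __init__(self, dim=1024, num_register_tokens=4):
        super().__init__()
        # Learnable tokens: created once as model parameters, not in the dataloader.
        self.camera_token = nn.Parameter(torch.randn(1, 1, 1, dim) * 0.02)
        self.register_tokens = nn.Parameter(torch.randn(1, 1, num_register_tokens, dim) * 0.02)

    def forward(self, patch_tokens):
        # patch_tokens: (B, S, P, dim) image patch tokens for B scenes of S frames each
        B, S, _, _ = patch_tokens.shape
        cam = self.camera_token.expand(B, S, -1, -1)      # one camera token per frame
        reg = self.register_tokens.expand(B, S, -1, -1)   # register tokens per frame
        # Concatenate along the token dimension; the attention layers then let these
        # tokens aggregate information from the input frames.
        return torch.cat([cam, reg, patch_tokens], dim=2)  # (B, S, 1 + R + P, dim)
```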
@jytime Thanks for the reply. What I mean is: how do you specify the location of each anchor frame in one batch during training? For example, in batch #1 I have sequences of lengths [2, 8, 14, 24] (48 frames in total), so the anchor frames are #1, #3, #11, and #25; the next time I have [24, 24], so the anchors are #1 and #25. How do you handle that?
Also, in this case, will images from different scenes in the same batch pass through global attention together during training? In my understanding, such attention would be inherently weird, as different scenes share neither co-visible areas nor meaningful connections.
Hi @Livioni , could you elaborate more on this? Sorry, I am still confused about the question here.
Hi @lifuguan , no, only images from one scene (sequence/image collection/...) go through global attention together. If you look at the aggregator code, we have a batch dimension B and a frame dimension S, so the input is B x S x 3 x H x W.
But according to the paper ("between 2 and 24 frames per scene while maintaining the constant total of 48 frames within each batch"), does that mean the 48 frames are collected from multiple scenes? For example, 2 images from scene A, 6 from scene B, 16 from scene C, and 24 from scene D. If there is any misunderstanding, please correct me.🙏
It should be B x S ~= 48, and S is a random number between 2 and 24. For example, if S=12, then B=4; if S=7, then B=6.
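To make that concrete, here is a rough sketch of this rule (an illustration only, not the actual training code; the constant and names are made up):

```python
# Rough illustration of the "B x S ~= 48" rule described above, not the actual code.
import random

TOTAL_FRAMES_PER_BATCH = 48

def sample_batch_shape(min_frames=2, max_frames=24):
    S = random.randint(min_frames, max_frames)  # frames per scene for this batch
    B = max(1, TOTAL_FRAMES_PER_BATCH // S)     # number of scenes so that B * S ~= 48
    return B, S

# e.g. S=12 -> B=4 (48 frames), S=7 -> B=6 (42 frames)
```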
I see! Brilliant! Thanks so much!
And to clarify, are the different sequences in a batch (of the same length) sampled randomly from different scenes of the same dataset?
Hi @davnords , they are sampled from different scenes, but not necessarily from the same dataset.
I am cleaning the training code now and hope to share more this week.
Hi Jianyuan, could you share the value of lambda_t used to balance rotation and translation, or is it just 1.0?