What is the frame sampling strategy of VGGT during training?
Hi, thanks for this promising work on 3D reconstruction.
Is there any trick for sampling diverse sequences from a monocular video during VGGT training? For example, ensuring the overlap between two video frames falls within a certain range, rather than being completely non-overlapping or having almost no camera movement.
I have the same concerns🤔
How do you sample the correct views during the training stage, especially across different datasets? Do you employ a pair selection strategy similar to DUST3R or Fast3R? If so, how do you handle view sampling in other datasets, such as the Aria Digital Twin? If not, could you kindly share your approach—or even the relevant code, if possible?
Thanks!
Hi @guangkaixu @jianglh-WHU ,
We didn’t follow the DUST3R strategy. Instead, we compute camera-pose-based similarity between frames using the code below. For each frame, we rank all others by pose similarity and select the top N nearest ones as its valid “range” (N = 512 or 256). Then, for each sequence, we randomly pick one frame as the first frame (call it the query, anchor, or whatever name you prefer) and sample the rest from its valid range.
We’ve visually checked many datasets and this method is stable. It also works very well during training. We chose it because it’s extremely fast and avoids co-visibility concerns altogether, which saves pain in data processing.
```python
import os
import os.path as osp
import numpy as np
import glob
import json
import pdb
import tqdm


def rotation_angle(R1, R2):
    # R1 and R2 are 3x3 rotation matrices
    R = R1.T @ R2
    # Numerical stability: clamp values into [-1, 1]
    val = (np.trace(R) - 1) / 2
    val = np.clip(val, -1.0, 1.0)
    angle_rad = np.arccos(val)
    angle_deg = np.degrees(angle_rad)  # Convert radians to degrees
    return angle_deg


def extrinsic_distance(extrinsic1, extrinsic2, lambda_t=1.0):
    R1, t1 = extrinsic1[:3, :3], extrinsic1[:3, 3]
    R2, t2 = extrinsic2[:3, :3], extrinsic2[:3, 3]
    rot_diff = rotation_angle(R1, R2) / 180
    center_diff = np.linalg.norm(t1 - t2)
    return rot_diff + lambda_t * center_diff


def rotation_angle_batch(R1, R2):
    # R1, R2: shape (N, 3, 3)
    # We want a matrix of rotation angles for all pairs.
    # We'll get R1^T R2 for each pair.
    # Expand dimensions to broadcast:
    #   R1^T: (N,3,3) -> (N,1,3,3)
    #   R2:   (N,3,3) -> (1,N,3,3)
    R1_t = np.transpose(R1, (0, 2, 1))[:, np.newaxis, :, :]  # shape (N,1,3,3)
    R2_b = R2[np.newaxis, :, :, :]  # shape (1,N,3,3)
    R_mult = np.matmul(R1_t, R2_b)  # shape (N,N,3,3)
    # trace(R) for each pair
    trace_vals = R_mult[..., 0, 0] + R_mult[..., 1, 1] + R_mult[..., 2, 2]  # (N,N)
    val = (trace_vals - 1) / 2
    val = np.clip(val, -1.0, 1.0)
    angle_rad = np.arccos(val)
    angle_deg = np.degrees(angle_rad)
    return angle_deg / 180.0  # normalized rotation difference


def extrinsic_distance_batch(extrinsics, lambda_t=1.0):
    # extrinsics: (N,4,4)
    # Extract rotation and translation
    R = extrinsics[:, :3, :3]  # (N,3,3)
    t = extrinsics[:, :3, 3]  # (N,3)
    # Compute all pairwise rotation differences
    rot_diff = rotation_angle_batch(R, R)  # (N,N)
    # Compute all pairwise translation differences
    # For t, shape (N,3). We want all pair differences: t[i] - t[j].
    #   t_i: (N,1,3), t_j: (1,N,3)
    t_i = t[:, np.newaxis, :]  # (N,1,3)
    t_j = t[np.newaxis, :, :]  # (1,N,3)
    trans_diff = np.linalg.norm(t_i - t_j, axis=2)  # (N,N)
    dists = rot_diff + lambda_t * trans_diff
    return dists


def rotation_angle_batch_chunked(R, chunk_size):
    N = R.shape[0]
    rot_diff = np.empty((N, N), dtype=np.float32)
    # Precompute R transpose once
    R_t = R.transpose(0, 2, 1)
    for i_start in range(0, N, chunk_size):
        i_end = min(N, i_start + chunk_size)
        # Sub-block of R_t
        R_i_t = R_t[i_start:i_end]  # (B,3,3)
        for j_start in range(0, N, chunk_size):
            j_end = min(N, j_start + chunk_size)
            R_j = R[j_start:j_end]  # (B,3,3)
            # Compute R_i_t @ R_j for the block
            #   R_i_t: (B,3,3)
            #   R_j:   (B,3,3), but we need pairwise products, so we expand dims
            # This can still be large. If even BxB is too big, choose smaller chunks.
            R_mult = R_i_t[:, np.newaxis, :, :] @ R_j[np.newaxis, :, :, :]  # (B,B,3,3)
            # Compute trace
            trace_vals = R_mult[..., 0, 0] + R_mult[..., 1, 1] + R_mult[..., 2, 2]
            val = (trace_vals - 1.0) / 2.0
            np.clip(val, -1.0, 1.0, out=val)
            angle_rad = np.arccos(val)
            angle_deg = np.degrees(angle_rad)
            block_rot_diff = angle_deg / 180.0
            rot_diff[i_start:i_end, j_start:j_end] = block_rot_diff.astype(np.float32)
    return rot_diff


def extrinsic_distance_batch_chunked(extrinsics, lambda_t=1.0, chunk_size=1000):
    R = extrinsics[:, :3, :3].astype(np.float32)
    t = extrinsics[:, :3, 3].astype(np.float32)
    N = R.shape[0]
    # Compute rotation differences in chunks
    rot_diff = rotation_angle_batch_chunked(R, chunk_size)
    # Compute translation differences in chunks
    dists = np.empty((N, N), dtype=np.float32)
    for i_start in range(0, N, chunk_size):
        i_end = min(N, i_start + chunk_size)
        t_i = t[i_start:i_end]  # (B,3)
        for j_start in range(0, N, chunk_size):
            j_end = min(N, j_start + chunk_size)
            t_j = t[j_start:j_end]  # (B,3)
            # broadcasting: (B,1,3) - (1,B,3) => (B,B,3)
            diff = t_i[:, None, :] - t_j[None, :, :]
            trans_diff = np.linalg.norm(diff, axis=2)  # (B,B)
            # Add rotation and translation
            dists[i_start:i_end, j_start:j_end] = (
                rot_diff[i_start:i_end, j_start:j_end] + lambda_t * trans_diff
            )
    return dists


def compute_ranking(extrinsics, lambda_t=1.0, normalize=True, batched=True):
    if normalize:
        extrinsics = np.copy(extrinsics)
        camera_center = np.copy(extrinsics[:, :3, 3])
        camera_center_scale = np.linalg.norm(camera_center, axis=1)
        avg_scale = np.mean(camera_center_scale)
        extrinsics[:, :3, 3] = extrinsics[:, :3, 3] / avg_scale
    if batched:
        if len(extrinsics) > 6000:
            dists = extrinsic_distance_batch_chunked(extrinsics, lambda_t=lambda_t)
        else:
            dists = extrinsic_distance_batch(extrinsics, lambda_t=lambda_t)
    else:
        N = extrinsics.shape[0]
        dists = np.zeros((N, N))
        for i in range(N):
            for j in range(N):
                dists[i, j] = extrinsic_distance(extrinsics[i], extrinsics[j], lambda_t=lambda_t)
    ranking = np.argsort(dists, axis=1)
    return ranking, dists
```
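As a rough illustration (not the actual training dataloader), the returned ranking could be used to sample a frame set along the lines described above. The function `sample_frames` and its parameters below are hypothetical names I'm using for the sketch:

```python
# Illustrative sketch only: how compute_ranking's output could drive frame sampling.
# The function name and parameters below are hypothetical, not from the VGGT codebase.
import numpy as np

def sample_frames(ranking, num_frames, top_n=256, rng=None):
    # ranking: (N, N) array from compute_ranking; row i lists frame indices
    #          sorted by pose distance to frame i (closest first, including i itself).
    # Assumes num_frames <= top_n.
    rng = np.random.default_rng() if rng is None else rng
    N = ranking.shape[0]
    query = rng.integers(N)                 # randomly pick the first (query/anchor) frame
    valid_range = ranking[query, :top_n]    # its top-N nearest frames by pose similarity
    candidates = valid_range[valid_range != query]
    others = rng.choice(candidates, size=num_frames - 1, replace=False)
    return np.concatenate([[query], others])

# Example usage (assuming extrinsics is an (N, 4, 4) array of camera extrinsics):
# ranking, _ = compute_ranking(extrinsics, lambda_t=1.0)
# frame_ids = sample_frames(ranking, num_frames=12, top_n=256)
```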
Hi @jytime , could you please explain more about how you manage to do this? In my understanding, within each batch there are special camera tokens and register tokens for each anchor frame; how do you compose them in the dataloader?
Hi @Livioni ,
Camera tokens and register tokens are learnable tokens; they are defined inside the network. They aggregate information from the input frames and do not need any special preparation in the data processing stage. You can check our implementation here:
https://github.com/facebookresearch/vggt/blob/b02cc03ceee70821ed1231a530c1992507ef9862/vggt/models/aggregator.py#L125
Check this paper for a better understanding:
https://arxiv.org/abs/2309.16588
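If it helps intuition, here is a generic, minimal sketch of the learnable-token idea. This is not the VGGT implementation (see the aggregator.py link above for the real code), and the names and dimensions are made up:

```python
# Generic sketch of learnable camera/register tokens, NOT the actual VGGT aggregator.
# Names and dimensions are illustrative only.
import torch
import torch.nn as nn

class TokenSketch(nn.Module):
    def __init__(self, dim=1024, num_register_tokens=4):
        super().__init__()
        # Learnable tokens: created once as model parameters, not in the dataloader.
        self.camera_token = nn.Parameter(torch.randn(1, 1, 1, dim) * 0.02)
        self.register_tokens = nn.Parameter(torch.randn(1, 1, num_register_tokens, dim) * 0.02)

    def forward(self, patch_tokens):
        # patch_tokens: (B, S, P, dim) image patch tokens for B scenes of S frames each
        B, S, _, _ = patch_tokens.shape
        cam = self.camera_token.expand(B, S, -1, -1)      # one camera token per frame
        reg = self.register_tokens.expand(B, S, -1, -1)   # register tokens per frame
        # Concatenate along the token dimension; the attention layers then let these
        # tokens aggregate information from the input frames.
        return torch.cat([cam, reg, patch_tokens], dim=2)  # (B, S, 1 + R + P, dim)
```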
@jytime Thanks for the reply. What I mean is: how do you specify the location of each anchor frame in one batch during training? For example, in batch #1 I have sequences of lengths [2, 8, 14, 24] (48 frames in total), so the anchor frames are #1, #3, #11, and #25; the next time I have [24, 24], so the anchors are #1 and #25. How do you handle that?
Also, in this case, will images from different scenes in the same batch pass through global attention together during training? In my understanding, such attention would be inherently weird, as different scenes share neither co-visible areas nor meaningful connections.
Hi @Livioni , could you elaborate more on this? Sorry, I am still confused about the question here.
Hi @lifuguan , no, only images from one scene (sequence/image collection/...) go through global attention together. If you look at the aggregator code, we have a batch dimension B and a frame dimension S, so the input is B x S x 3 x H x W.
But according to the paper ("between 2 and 24 frames per scene while maintaining the constant total of 48 frames within each batch"), does that mean the 48 frames are collected from multiple scenes? For example, 2 images from scene A, 6 from scene B, 16 from scene C, and 24 from scene D. If there is any misunderstanding, please correct me.🙏
It should be B x S ~= 48, and S is a random number between 2 and 24. For example, if S=12, then B=4; if S=7, then B=6.
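To make that concrete, here is a rough sketch of this rule (an illustration only, not the actual training code; the constant and names are made up):

```python
# Rough illustration of the "B x S ~= 48" rule described above, not the actual code.
import random

TOTAL_FRAMES_PER_BATCH = 48

def sample_batch_shape(min_frames=2, max_frames=24):
    S = random.randint(min_frames, max_frames)  # frames per scene for this batch
    B = max(1, TOTAL_FRAMES_PER_BATCH // S)     # number of scenes so that B * S ~= 48
    return B, S

# e.g. S=12 -> B=4 (48 frames), S=7 -> B=6 (42 frames)
```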
I see! Brilliant! Thanks so much!
And to clarify, are the different sequences in a batch (of the same length) sampled randomly from different scenes of the same dataset?
Hi @davnords , they are sampled from different scenes, but not necessarily from the same dataset.
I am cleaning the training code now and hope to share more this week.
Hi Jianyuan, could you share the value of lambda_t used to balance rotation and translation, or is it just 1.0?