Real-life metric measurements with Ruicheng/moge-2-vitl-normal
Hi,
The distances in the resulting point cloud come out noticeably larger (2x? more? less?) than the real-life distances. What am I doing wrong?
Thank you!
"""
Google Colab script for video to point cloud conversion using MoGe
Cell 1 - Install dependencies:
!pip install torch torchvision git+https://github.com/microsoft/MoGe.git opencv-contrib-python open3d
Cell 2 - Main script:
"""
# Imports
import cv2
import numpy as np
import open3d as o3d
import torch
from moge.model.v2 import MoGeModel
from google.colab import files
import time
import os
import shutil
import threading
from queue import Queue
from dataclasses import dataclass
from typing import Optional, Tuple
import csv
@dataclass
class FrameData:
    """Container for frame data through the pipeline"""
    index: int
    original: np.ndarray
    processed: Optional[np.ndarray] = None
    tensor: Optional[torch.Tensor] = None
# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model = MoGeModel.from_pretrained("Ruicheng/moge-2-vitl-normal").to(device).half().eval()
# Create/clean output folder
output_folder = 'point_clouds'
if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
os.makedirs(output_folder)
# Open CSV file for intrinsics
intrinsics_csv = open('camera_intrinsics.csv', 'w', newline='')
intrinsics_writer = csv.writer(intrinsics_csv)
intrinsics_writer.writerow(['frame_index', 'fx', 'fy', 'cx', 'cy', 'fov_x_deg', 'fov_y_deg'])
# Video
video_path = 'Joe.mp4'
# Parameters
frame_interval_sec = 0.5 # Interval between frames to process (seconds)
scale_percent = 50 # Percentage to downscale frames
min_distance = 1.5 # Minimum distance from camera (meters)
fov_x_degrees = 81.0 # Horizontal field of view in degrees
# Process video
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
frame_skip = int(fps * frame_interval_sec)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Set up queues
gpu_queue = Queue(maxsize=8)
write_queue = Queue()
def preprocess_frame(frame_data: FrameData, scale_percent: int) -> FrameData:
    """Preprocess a single frame: resize and convert to tensor"""
    frame = frame_data.original
    # Downscale if needed
    if scale_percent != 100:
        scale = scale_percent / 100
        frame = cv2.resize(frame, (int(width * scale), int(height * scale)))
    # Convert BGR to RGB
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Create tensor
    tensor = torch.from_numpy(rgb / 255.0).to(device, dtype=torch.float16).permute(2, 0, 1)
    frame_data.processed = rgb
    frame_data.tensor = tensor
    return frame_data
def filter_points_gpu(points_gpu: torch.Tensor, mask_gpu: torch.Tensor, rgb_tensor: torch.Tensor,
                      min_distance: float = 1.5) -> Tuple[np.ndarray, np.ndarray]:
    """Filter point cloud data on GPU before transferring to CPU"""
    # Calculate Euclidean distance from camera
    distance = torch.sqrt(points_gpu[..., 0]**2 + points_gpu[..., 1]**2 + points_gpu[..., 2]**2)
    # Filter points based on:
    # 1. MoGe's confidence mask
    # 2. Minimum distance from camera
    # 3. Valid (finite) values
    valid_mask = mask_gpu & (distance > min_distance) & torch.isfinite(points_gpu).all(dim=-1)
    # Flatten and extract valid points on GPU
    points_flat = points_gpu.view(-1, 3)
    rgb_flat = rgb_tensor.view(-1, 3)
    mask_flat = valid_mask.view(-1)
    valid_points_gpu = points_flat[mask_flat]
    valid_colors_gpu = rgb_flat[mask_flat]
    # Transfer only filtered points to CPU
    valid_points = valid_points_gpu.cpu().numpy()
    valid_colors = valid_colors_gpu.cpu().numpy()
    return valid_points, valid_colors
def writer_thread():
    """Background thread for writing PLY files"""
    while True:
        item = write_queue.get()
        if item is None:
            break
        filename, points, colors = item
        try:
            # Create Open3D object here (in background thread)
            pcd = o3d.geometry.PointCloud()
            pcd.points = o3d.utility.Vector3dVector(points)
            pcd.colors = o3d.utility.Vector3dVector(colors / 255.0)
            # Write PLY file
            o3d.io.write_point_cloud(filename, pcd, write_ascii=False)
        except Exception as e:
            print(f"Error writing {filename}: {e}")
        finally:
            write_queue.task_done()
def gpu_inference_thread():
    """GPU inference thread - process one frame at a time"""
    frame_count = 0
    total_gpu_time = 0
    total_time = 0
    while True:
        frame_data = gpu_queue.get()
        if frame_data is None:  # Sentinel
            gpu_queue.task_done()
            break
        # Process single frame
        start_time = time.time()
        # GPU inference with known FOV
        with torch.no_grad():
            outputs = model.infer(frame_data.tensor, fov_x=fov_x_degrees)
        # Get RGB data from input tensor (already on GPU)
        rgb_gpu = frame_data.tensor.permute(1, 2, 0) * 255.0
        # Filter on GPU before CPU transfer
        valid_points, valid_colors = filter_points_gpu(
            outputs['points'],
            outputs['mask'],
            rgb_gpu,
            min_distance
        )
        # Extract and save intrinsics
        intrinsics = outputs['intrinsics'].cpu().numpy()
        fx, fy = intrinsics[0, 0], intrinsics[1, 1]
        cx, cy = intrinsics[0, 2], intrinsics[1, 2]
        # Convert intrinsics to FOV (for verification)
        h, w = frame_data.tensor.shape[1:]
        fov_x_rad = 2 * np.arctan(w / (2 * fx))
        fov_y_rad = 2 * np.arctan(h / (2 * fy))
        fov_x_deg = np.rad2deg(fov_x_rad)
        fov_y_deg = np.rad2deg(fov_y_rad)
        # Write to CSV
        intrinsics_writer.writerow([frame_data.index, fx, fy, cx, cy, fov_x_deg, fov_y_deg])
        intrinsics_csv.flush()  # Ensure data is written immediately
        # Clean GPU memory
        torch.cuda.empty_cache()
        gpu_time = time.time() - start_time
        # Send filtered data to writer
        filename = os.path.join(output_folder, f'frame_{frame_data.index:04d}.ply')
        write_queue.put((filename, valid_points, valid_colors))
        total_frame_time = time.time() - start_time
        # Update statistics
        frame_count += 1
        total_gpu_time += gpu_time
        total_time += total_frame_time
        # Print progress every 10 frames
        if frame_count % 10 == 0:
            avg_gpu = total_gpu_time / frame_count
            avg_total = total_time / frame_count
            print(f"Processed {frame_count} frames: Avg GPU {avg_gpu:.3f}s, Avg Total {avg_total:.3f}s per frame")
            # Also show current GPU memory usage
            if torch.cuda.is_available():
                allocated = torch.cuda.memory_allocated() / 1024**3
                reserved = torch.cuda.memory_reserved() / 1024**3
                print(f"  GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
        gpu_queue.task_done()
    # Final statistics
    if frame_count > 0:
        print(f"\nGPU thread complete: {frame_count} frames")
        print(f"Average GPU time: {total_gpu_time/frame_count:.3f}s per frame")
        print(f"Average total time: {total_time/frame_count:.3f}s per frame")
    # Signal end to write queue
    write_queue.put(None)
# Start threads
writer = threading.Thread(target=writer_thread, daemon=True)
gpu_thread = threading.Thread(target=gpu_inference_thread, daemon=True)
writer.start()
gpu_thread.start()
# Main loop - read video and preprocess frames
frame_idx = 0
processed_count = 0
start_total = time.time()
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if frame_idx % frame_skip == 0:
        # Create frame data
        frame_data = FrameData(index=frame_idx, original=frame)
        # Preprocess frame
        preprocess_frame(frame_data, scale_percent)
        # Send to GPU queue
        gpu_queue.put(frame_data)
        processed_count += 1
    frame_idx += 1
# Signal end of stream
gpu_queue.put(None)
# Wait for completion
gpu_thread.join()
writer.join()
# Statistics
total_time = time.time() - start_total
print(f"\nProcessed {processed_count} frames")
print(f"Total time: {total_time:.2f}s")
print(f"Average time per frame: {total_time/processed_count:.3f}s")
print(f"Overall FPS: {processed_count/total_time:.2f}")
cap.release()
intrinsics_csv.close()
print("\nAll files written successfully!")
print("Camera intrinsics saved to: camera_intrinsics.csv")
# Download results (add to a separate cell in Colab)
# !zip -r point_clouds.zip point_clouds/
# files.download('point_clouds.zip')
Also, it seems that MoGe-2 is ignoring the fov_x=81 input and always reporting around 179 degrees horizontal FOV, even for very different input fov_x values.
Hi. It looks like you're interpreting the predicted intrinsics as if they were in pixel space, which leads to an incorrect FOV computation of nearly 180 degrees.
You're currently using:
fov_x_rad = 2 * np.arctan(w / (2 * fx))
fov_y_rad = 2 * np.arctan(h / (2 * fy))
However, since the output fx and fy are already normalized by image width and height respectively, the correct computation should be:
fov_x_rad = 2 * np.arctan(1 / (2 * fx))
fov_y_rad = 2 * np.arctan(1 / (2 * fy))
This should give you the correct field of view in radians.
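As a concrete sketch of the fix (this assumes the principal point entries cx and cy are normalized the same way as fx and fy), reusing the intrinsics, h and w variables already present in the script above:
# intrinsics: normalized 3x3 matrix from model.infer(); h, w: processed frame size
fx_n, fy_n = intrinsics[0, 0], intrinsics[1, 1]
cx_n, cy_n = intrinsics[0, 2], intrinsics[1, 2]
# Pixel-space equivalents, if you need them downstream
fx_px, fy_px = fx_n * w, fy_n * h
cx_px, cy_px = cx_n * w, cy_n * h
# FOV computed directly from the normalized focal lengths
fov_x_deg = np.rad2deg(2 * np.arctan(1 / (2 * fx_n)))
fov_y_deg = np.rad2deg(2 * np.arctan(1 / (2 * fy_n)))
With the corrected formula, fov_x_deg should come back close to the 81 degrees you pass in, rather than ~179.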
Regarding the real-life metric scale: could you share some examples of your input images? In general, the model produces the most accurate metric scales for typical indoor and street scenes. For unfamiliar objects or extremely close-up views, the predictions may be less reliable due to limited training data in those domains.
Thank you for this. I'll check the intrinsics aspect.
As for the depth, please see the attached photo. All the distances come out at about double their real distance from the camera; the end of the lance is about 75 cm from the camera, for example.
Field of view (FoV): diagonal 93°, horizontal 81°
Camera matrix:
[[1.09579882e+03 0.00000000e+00 9.50900653e+02]
 [0.00000000e+00 1.09259800e+03 5.19706072e+02]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00]]
Distortion coefficients:
[[-0.06877963  0.08537744 -0.00089973 -0.00070357 -0.01667835]]
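For reference, a minimal sketch of deriving fov_x for model.infer() from that pixel-space camera matrix; the 1920x1080 resolution here is an assumption inferred from cx ≈ 951 and cy ≈ 520:
import numpy as np

fx_px = 1095.79882        # from the calibration matrix above
image_width = 1920        # assumed full-resolution width
fov_x_deg = np.rad2deg(2 * np.arctan(image_width / (2 * fx_px)))
print(fov_x_deg)          # ~82 degrees, consistent with the stated 81 degree horizontal FoV
# Downscaling the frame does not change the FoV, so the same value can be passed
# to model.infer(tensor, fov_x=fov_x_deg)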
I'm sorry, but the model is not expected to predict an accurate scale for this image. Estimating metric scale requires the model to recognize common objects with known sizes as geometric cues, such as cars, lanes, trees, or regular buildings. Since this image is synthetic and lacks such familiar references, the model cannot reliably infer scale. In fact, even for a human, it's difficult to estimate the approximate scale of this scene. For better results, we recommend testing the model on real-world scenes containing everyday objects with recognizable sizes.
Thank you for your reply. It really means a lot that you're putting in the time to answer these questions.
The image is not synthetic. It was taken from our drone 😬. But I understand what you're saying about the model relying on visual cues learned during pre-training. This makes perfect sense.
My scene has a car, but when I compare the MoGe-2 output against LiDAR, there is still a significant difference in scale. I passed my ground-truth intrinsics as input.
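If it helps to quantify the mismatch, here is a minimal sketch for estimating a global scale factor between the two, assuming a LiDAR depth map projected into the same image; moge_depth, lidar_depth and valid are hypothetical arrays:
import numpy as np

# moge_depth, lidar_depth: H x W depth maps in meters, aligned to the same image (hypothetical)
# valid: boolean mask where both depth sources are defined
ratios = lidar_depth[valid] / moge_depth[valid]
scale = np.median(ratios)  # robust estimate of the global scale error
print(f"Median LiDAR/MoGe scale factor: {scale:.3f}")
# A value near 2.0 would match the "about double" observation earlier in the thread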