Weird cone-shaped point cloud when using a mono camera with Depth-Anything-V2 to get an RGB-D image
Hi, I am trying to use Depth-Anything-V2 with a mono camera to emulate an RGB-D image. I have managed to create the nodes and produce a neat depth image, but I have a problem with rtabmap reconstructing the point cloud: it produces an erratic cone-shaped 3D reconstruction that starts at the middle of the frame and expands deeper toward the edges.
I wonder if anyone can help me point out what I missed.
Launch file:
<launch>
  <!-- Single camera setup -->
  <node name="camera" pkg="usb_cam" type="usb_cam_node" output="screen">
    <param name="video_device" value="/dev/video0" />
    <param name="image_width" value="640" />
    <param name="image_height" value="480" />
    <param name="pixel_format" value="mjpeg" />
    <param name="camera_frame_id" value="camera_optical_frame" />
    <param name="io_method" value="mmap"/>
    <param name="framerate" value="1" />
    <param name="camera_info_url" value="file://$(find my_stereo_setup)/ost.yaml"/>
  </node>

  <!-- Image rectification -->
  <node pkg="image_proc" type="image_proc" name="image_proc">
    <remap from="image_raw" to="/camera/image_raw"/>
    <remap from="camera_info" to="/camera/camera_info"/>
  </node>

  <!-- Depth-Anything-V2 node -->
  <node name="depth_anything_node" pkg="my_stereo_setup" type="depth_anything_node.py" output="screen">
    <param name="model_path" value="$(env HOME)/Projects/Depth-Anything-V2/checkpoints/depth_anything_v2_vits.pth"/>
    <param name="encoder" value="vits"/>
    <param name="calibration_file" value="$(find my_stereo_setup)/ost.yaml"/>
  </node>

  <!-- RGBD Sync node -->
  <node pkg="nodelet" type="nodelet" name="rgbd_sync" args="standalone rtabmap_sync/rgbd_sync" output="screen">
    <remap from="rgb/image" to="/camera/rgb/image_rect_color"/>
    <remap from="depth/image" to="/camera/depth_registered/image_raw"/>
    <remap from="rgb/camera_info" to="/camera/camera_info"/>
    <remap from="rgbd_image" to="/rgbd_image"/>
    <param name="approx_sync" value="true"/>
    <param name="approx_sync_max_interval" value="0.1"/>
    <param name="queue_size" value="5"/>
  </node>

  <node pkg="tf" type="static_transform_publisher" name="odom_to_camera"
        args="0 0 0 0 0 0 odom camera_optical_frame 10" />

  <!-- RTAB-Map node -->
  <include file="$(find rtabmap_launch)/launch/rtabmap.launch">
    <arg name="rgbd_topic" value="/rgbd_image"/>
    <arg name="camera_info_topic" value="/camera/camera_info"/>
    <arg name="frame_id" value="camera_optical_frame"/>
    <arg name="odom_frame_id" value="odom"/>
    <arg name="approx_sync" value="true"/>
    <arg name="queue_size" value="5"/>
    <arg name="visual_odometry" value="true"/>
    <arg name="wait_for_transform" value="5.0"/>
    <arg name="odom_guess_frame_id" value="odom"/>
    <arg name="rgbd_depth_scale" value="0.1"/>
    <arg name="rviz" value="true"/>
    <arg name="rtabmap_args" value="
        --delete_db_on_start
        --Vis/FeatureType 1
        --Vis/MinInliers 10
        --Vis/MaxFeatures 1000
        --RGBD/ProximityBySpace true
        --RGBD/AngularUpdate 0.05
        --RGBD/LinearUpdate 0.05
        --RGBD/RateThreshold 0.5
        --Rtabmap/LoopClosureReactivation 0.01
        --Rtabmap/DetectionRate 0.02
        --Grid/3D true
        --Grid/CellSize 0.05
        --Vis/CorrespondenceRatio 0.3
        --Mem/ImagePreDecimation 2
        --Mem/ImagePostDecimation 2
        --Vis/Iterations 1
        --publish_tf true
    "/>
  </include>

  <!-- Debug: Image view nodes -->
  <node name="image_view_rgb" pkg="image_view" type="image_view" respawn="false" output="screen">
    <remap from="image" to="/camera/rgb/image_rect_color"/>
    <param name="autosize" value="true" />
  </node>
  <node name="image_view_depth" pkg="image_view" type="image_view" respawn="false" output="screen">
    <remap from="image" to="/camera/depth_registered/image_raw"/>
    <param name="autosize" value="true" />
  </node>
</launch>
Python Code: depth_anything_node.py
#!/usr/bin/env python3
import rospy
from sensor_msgs.msg import Image, CameraInfo
from cv_bridge import CvBridge, CvBridgeError
import cv2
import torch
import numpy as np
import os
import sys
import yaml

# Add the path to the Depth-Anything-V2 repository
depth_anything_path = os.path.expanduser('~/Projects/Depth-Anything-V2')
sys.path.append(depth_anything_path)

from depth_anything_v2.dpt import DepthAnythingV2


class DepthAnythingNode:
    def __init__(self):
        self.bridge = CvBridge()
        self.rgb_pub = rospy.Publisher("/camera/rgb/image_rect_color", Image, queue_size=1)
        self.depth_pub = rospy.Publisher("/camera/depth_registered/image_raw", Image, queue_size=1)
        self.camera_info_pub = rospy.Publisher("/camera/camera_info", CameraInfo, queue_size=1)

        # Get parameters
        self.model_path = rospy.get_param('~model_path')
        self.encoder = rospy.get_param('~encoder', 'vits')
        self.calibration_file = rospy.get_param('~calibration_file', '~/catkin_ws/src/my_stereo_setup/ost.yaml')

        # Load camera calibration
        self.camera_info = self.load_camera_info(self.calibration_file)

        # Initialize Depth-Anything-V2 model
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_configs = {
            'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        }
        self.model = None

        rospy.loginfo(f"Initializing Depth-Anything-V2 model with encoder: {self.encoder}")
        rospy.loginfo(f"Model path: {self.model_path}")
        rospy.loginfo(f"Device: {self.device}")
        rospy.loginfo(f"Calibration file: {self.calibration_file}")

        self.initialize_model()

        # Only subscribe to the image topic if the model is initialized successfully
        if self.model is not None:
            self.image_sub = rospy.Subscriber("/image_rect_color", Image, self.image_callback)
            rospy.loginfo("Depth-Anything-V2 node initialized successfully")
        else:
            rospy.logerr("Failed to initialize Depth-Anything-V2 node")

    def load_camera_info(self, calibration_file):
        with open(os.path.expanduser(calibration_file), 'r') as file:
            calib_data = yaml.safe_load(file)
        camera_info = CameraInfo()
        camera_info.width = calib_data['image_width']
        camera_info.height = calib_data['image_height']
        camera_info.K = calib_data['camera_matrix']['data']
        camera_info.D = calib_data['distortion_coefficients']['data']
        camera_info.R = calib_data['rectification_matrix']['data']
        camera_info.P = calib_data['projection_matrix']['data']
        camera_info.distortion_model = calib_data['distortion_model']
        return camera_info

    def initialize_model(self):
        try:
            self.model = DepthAnythingV2(**self.model_configs[self.encoder])
            if not os.path.exists(self.model_path):
                rospy.logerr(f"Model file not found: {self.model_path}")
                return
            self.model.load_state_dict(torch.load(self.model_path, map_location='cpu'))
            self.model = self.model.to(self.device).eval()
            rospy.loginfo("Depth-Anything-V2 model loaded successfully")
        except Exception as e:
            rospy.logerr(f"Error initializing Depth-Anything-V2 model: {str(e)}")
            self.model = None

    def image_callback(self, msg):
        if self.model is None:
            rospy.logerr("Model not initialized. Skipping image processing.")
            return

        current_time = rospy.Time.now()
        rospy.loginfo(f"Received image with timestamp: {msg.header.stamp}, current time: {current_time}")

        try:
            cv_image = self.bridge.imgmsg_to_cv2(msg, "bgr8")
        except CvBridgeError as e:
            rospy.logerr(e)
            return

        # Process image with Depth-Anything-V2
        with torch.no_grad():
            depth = self.model.infer_image(cv_image)

        # Normalize depth for visualization (you might want to adjust this)
        depth_normalized = cv2.normalize(depth, None, 0, 65535, cv2.NORM_MINMAX)
        depth_normalized = depth_normalized.astype(np.uint16)  # Convert to 16-bit for ROS

        # Use current time for publishing
        current_time = rospy.Time.now()

        # Publish RGB and depth images
        rgb_msg = self.bridge.cv2_to_imgmsg(cv_image, "bgr8")
        rgb_msg.header.stamp = current_time
        rgb_msg.header.frame_id = "camera_optical_frame"
        self.rgb_pub.publish(rgb_msg)

        depth_msg = self.bridge.cv2_to_imgmsg(depth_normalized, "16UC1")
        depth_msg.header.stamp = current_time
        depth_msg.header.frame_id = "camera_optical_frame"
        self.depth_pub.publish(depth_msg)

        # Publish camera info
        self.camera_info.header.stamp = current_time
        self.camera_info.header.frame_id = "camera_optical_frame"
        self.camera_info_pub.publish(self.camera_info)

        rospy.loginfo(f"Published RGB and depth images. RGB shape: {cv_image.shape}, Depth shape: {depth_normalized.shape}")


if __name__ == '__main__':
    rospy.init_node('depth_anything_node', anonymous=True)
    node = DepthAnythingNode()
    rospy.spin()
I just tried ORB_SLAM3's RGB-D mode, and it reproduces a similar cone shape, so it must be a bug in my implementation.
Preliminary guess: should the depth information be in meters? I've used the full 16UC1 range (0–65535). I've now realized there's a metric model for Depth-Anything-V2 as well, so I'm going to try to refactor my code with it.
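Loading the metric variant should look roughly like this (untested sketch; the checkpoint name, the max_depth values, and importing from the repo's metric_depth subfolder are my assumptions based on its README):

import os
import sys
import cv2
import torch

# The metric checkpoints go with the repo's metric_depth/ subfolder, whose
# DepthAnythingV2 takes an extra max_depth argument (assumption from the repo
# layout; put this folder on the path instead of the repo root).
sys.path.append(os.path.expanduser('~/Projects/Depth-Anything-V2/metric_depth'))
from depth_anything_v2.dpt import DepthAnythingV2

model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
}
max_depth = 20  # reportedly 20 for indoor (Hypersim), 80 for outdoor (VKITTI) checkpoints

model = DepthAnythingV2(**{**model_configs['vits'], 'max_depth': max_depth})
model.load_state_dict(torch.load(
    os.path.expanduser('~/Projects/Depth-Anything-V2/checkpoints/'
                       'depth_anything_v2_metric_hypersim_vits.pth'),  # assumed checkpoint name
    map_location='cpu'))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

raw_img = cv2.imread('frame.png')
depth_m = model.infer_image(raw_img)  # HxW float array, metric depth in meters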
If you use 16UC1, it should be in mm. If you use 32FC1, it should be in meters. How is the depth scale estimated? If the scale is not fixed between images while the camera is moving, visual odometry will fail.
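For example, assuming depth_m is a float NumPy array holding metric depth in meters (the variable names here are illustrative):

import numpy as np
from cv_bridge import CvBridge

bridge = CvBridge()
depth_m = np.full((480, 640), 2.0, dtype=np.float32)  # placeholder: every pixel at 2 m

# 32FC1: float32 depth in meters; mark invalid pixels with 0 or NaN
msg_32f = bridge.cv2_to_imgmsg(depth_m, encoding="32FC1")

# 16UC1: uint16 depth in millimeters; mark invalid pixels with 0
depth_mm = np.clip(depth_m * 1000.0, 0, 65535).astype(np.uint16)
msg_16u = bridge.cv2_to_imgmsg(depth_mm, encoding="16UC1")

Whichever encoding you use, the conversion factor must stay constant across frames: a per-frame cv2.normalize() to the full (0, 65535) range gives every image its own scale, so the same surface ends up at a different depth in each frame.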