monosdf realworld reconstruction using colmap

Hi, thanks for the wonferful work!

I want to reconstruct the real indoor scene and use colmap to recover the pose, but the reconstruction results are very poor，how can i solve it?

Thank you very much!

May 22 '23 09:05 LLLLLLLianHua

Hi, I guess more details are needed for me to understand you problem.

May 22 '23 16:05 niujinshuchong

Much appreciated your reply! I wanted to reconstruct a real indoor scene, about 15 square meters, use colmap get image pose,and use following code to process data

import numpy as np
import cv2
import torch
import os
from scipy.spatial.transform import Slerp
from scipy.interpolate import interp1d
from scipy.spatial.transform import Rotation as R
import json
import trimesh
import argparse
import glob
import math
import PIL
import ipdb
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
from torch.nn import functional as F


scenes = ['lab_huang']
out_names = ['scan1']


def qvec2rotmat(qvec):
	return np.array([
		[
			1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,
			2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
			2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]
		], [
			2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
			1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,
			2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]
		], [
			2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
			2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
			1 - 2 * qvec[1]**2 - 2 * qvec[2]**2
		]
	])

def get_pose_colmap(args):
    print("get image extrinsic and intrinsic data from colmap results")

    for scene, out_name in zip(scenes, out_names):

        TEXT_FOLDER = os.path.join(args.data_root, scene, 'sparse', '0')
        with open(os.path.join(TEXT_FOLDER, "cameras.txt"), "r") as f:
            angle_x = math.pi / 2
            for line in f:
                # 1 SIMPLE_RADIAL 2048 1536 1580.46 1024 768 0.0045691
                # 1 OPENCV 3840 2160 3178.27 3182.09 1920 1080 0.159668 -0.231286 -0.00123982 0.00272224
                # 1 RADIAL 1920 1080 1665.1 960 540 0.0672856 -0.0761443
                if line[0] == "#":
                    continue
                els = line.split(" ")
                w = float(els[2])
                h = float(els[3])
                fl_x = float(els[4])
                fl_y = float(els[4])
                k1 = 0
                k2 = 0
                p1 = 0
                p2 = 0
                cx = w / 2
                cy = h / 2
                if els[1] == "SIMPLE_PINHOLE":
                    cx = float(els[5])
                    cy = float(els[6])
                elif els[1] == "PINHOLE":
                    fl_y = float(els[5])
                    cx = float(els[6])
                    cy = float(els[7])
                elif els[1] == "SIMPLE_RADIAL":
                    cx = float(els[5])
                    cy = float(els[6])
                    k1 = float(els[7])
                elif els[1] == "RADIAL":
                    cx = float(els[5])
                    cy = float(els[6])
                    k1 = float(els[7])
                    k2 = float(els[8])
                elif els[1] == "OPENCV":
                    fl_y = float(els[5])
                    cx = float(els[6])
                    cy = float(els[7])
                    k1 = float(els[8])
                    k2 = float(els[9])
                    p1 = float(els[10])
                    p2 = float(els[11])
                else:
                    print("unknown camera model ", els[1])
                angle_x = math.atan(w / (fl_x * 2)) * 2
                angle_y = math.atan(h / (fl_y * 2)) * 2
                fovx = angle_x * 180 / math.pi
                fovy = angle_y * 180 / math.pi

        print(f"camera:\n\tres={w,h}\n\tcenter={cx,cy}\n\tfocal={fl_x,fl_y}\n\tfov={fovx,fovy}\n\tk={k1,k2} p={p1,p2} ")
        intrinsic = np.array([[fl_x, 0, cx],
                            [0, fl_y, cy],
                            [0, 0, 1]])

        print(intrinsic)

        with open(os.path.join(TEXT_FOLDER, "images.txt"), "r") as f:
            i = 0

            bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4])

            extrinsics = []

            for line in f:
                line = line.strip()

                if line[0] == "#":
                    continue

                i = i + 1

                if i % 2 == 1:
                    elems = line.split(" ")  # 1-4 is quat,5-7 is trans, 9ff is filename (9, if filename contains no spaces)

                    name = '_'.join(elems[9:])
                    full_name = os.path.join(args.images, name)

                    qvec = np.array(tuple(map(float, elems[1:5])))
                    tvec = np.array(tuple(map(float, elems[5:8])))
                    R = qvec2rotmat(-qvec)
                    t = tvec.reshape([3, 1])
                    m = np.concatenate([np.concatenate([R, t], 1), bottom], 0)
                    c2w = np.linalg.inv(m)


                    extrinsics.append(c2w)

        
        return intrinsic, extrinsics


def preprocess_data(args, intrinsic, extrinsics):

    image_size = 384
    trans_totensor = transforms.Compose(
        [
            transforms.CenterCrop(1080),
            transforms.Resize(image_size, interpolation=PIL.Image.BILINEAR),
        ]
    )

    out_path_prefix = args.out_path_prefix
    data_root = args.data_root


    for scene, out_name in zip(scenes, out_names):
        out_path = os.path.join(out_path_prefix, out_name)
        os.makedirs(out_path, exist_ok=True)
        print(out_path)

        folders = ["image", "mask", "depth"]
        for folder in folders:
            out_folder = os.path.join(out_path, folder)
            os.makedirs(out_folder, exist_ok=True)

        # load color
        color_path = os.path.join(data_root, scene, 'images')
        color_paths = sorted(glob.glob(os.path.join(color_path, '*.png')),
                            key=lambda x: int(os.path.basename(x)[:-4]))

        # load intrinsic
        camera_intrinsic = intrinsic
        print(camera_intrinsic)

        # load pose
        poses = extrinsics
        poses = np.stack(poses)
        print(poses.shape, len(color_paths))

        # deal with invalid poses
        valid_poses = np.isfinite(poses).all(axis=2).all(axis=1)
        min_vertices = poses[:, :3, 3][valid_poses].min(axis=0)
        max_vertices = poses[:, :3, 3][valid_poses].max(axis=0)

        center = (min_vertices + max_vertices) / 2.
        scale = 2. / (np.max(max_vertices - min_vertices) + 3.)
        print(center, scale)

        # we should normalized to unit cube
        scale_mat = np.eye(4).astype(np.float32)
        scale_mat[:3, 3] = -center
        scale_mat[:3] *= scale
        scale_mat = np.linalg.inv(scale_mat)

        # copy image
        cameras = {}
        pcds = []
        H, W = 1080, 1920
        # center crop by 720
        offset_x = (W - 1080) * 0.5
        offset_y = (H - 1080) * 0.5
        camera_intrinsic[0, 2] -= offset_x
        camera_intrinsic[1, 2] -= offset_y
        # resize
        resize_factor = 384 / 1080
        camera_intrinsic[:2, :] *= resize_factor

        K = np.eye(4)
        K[:3, :3] = camera_intrinsic
        print(K)

        out_index = 0
        for idx, (valid, pose, image_path) in enumerate(zip(valid_poses, poses, color_paths)):
            if not valid:
                continue

            target_image = os.path.join(out_path, "image/%06d.png" % (out_index))

            img = Image.open(image_path)
            img_tensor = trans_totensor(img)
            img_tensor.save(target_image)

            mask = (np.ones((image_size, image_size, 3)) * 255.).astype(np.uint8)

            target_image = os.path.join(out_path, "mask/%03d.png" % (out_index))
            cv2.imwrite(target_image, mask)

            # save pose
            pcds.append(pose[:3, 3])
            pose = K @ np.linalg.inv(pose)

            # cameras["scale_mat_%d"%(out_index)] = np.eye(4).astype(np.float32)
            cameras["scale_mat_%d" % (out_index)] = scale_mat
            cameras["world_mat_%d" % (out_index)] = pose

            out_index += 1
        np.savez(os.path.join(out_path,"cameras.npz"), **cameras)


if __name__ == '__main__':

    intrinsic, extrinsics = get_pose_colmap(args)
    preprocess_data(args, intrinsic, extrinsics)

mesh is like this

thank you very much

May 23 '23 06:05 LLLLLLLianHua

Hi, can you maybe check the camera poses normalisations since you are using

center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

which are defined based on scannet dataset with depth sensors but you are using poses from colmap.

May 23 '23 07:05 niujinshuchong

Thank you for your reply

I have a question about

scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

the numbers 2 and 3 represent what ? when I using colmap how should I change them to adapt to the monosdf

May 23 '23 10:05 LLLLLLLianHua

2 stands for -1 to 1 and +3 because we assume each camera sees 1.5 meters forward in scannet.

May 23 '23 16:05 niujinshuchong

Thank you for your patience!

center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

center and scale is The pose of colmap is not recovered by scale （I use Arkit pose and result is good）, do I need to do scale restoration？if not , How do I set the parameters to normalize camera pose

Thank you very much

May 24 '23 07:05 LLLLLLLianHua

The goal is to make the scene to lies in the range [-1, 1].

May 24 '23 07:05 niujinshuchong

Thank you for your reply

May 26 '23 03:05 LLLLLLLianHua

Thank you for your reply

Hi!How did you finally solve it? I met the same problem when I use the following code:

 center = (min_vertices + max_vertices) / 2.
 scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

Jun 28 '23 04:06 thucz

@niujinshuchong I'm really sorry, but I still don't understand how to solve this problem. May I know how to specifically solve this problem?

Jun 30 '23 07:06 thucz

2 stands for -1 to 1 and +3 because we assume each camera sees 1.5 meters forward in scannet.

maybe that's the case for indoor rooms that are a ±square? what if the indoor dataset is a long hallway with low ceiling? then the normalization doesn't make much sense no? XY positions will be normalized between -1 and 1 but the Z values will all be close to 0. I think the solution is to use 2 scales. one for horizontal positions and one for vertical positions

Jul 16 '25 08:07 Golbstein