monosdf icon indicating copy to clipboard operation
monosdf copied to clipboard

realworld reconstruction using colmap

Open LLLLLLLianHua opened this issue 2 years ago • 11 comments

Hi, thanks for the wonferful work!

I want to reconstruct the real indoor scene and use colmap to recover the pose, but the reconstruction results are very poor,how can i solve it?

Thank you very much!

LLLLLLLianHua avatar May 22 '23 09:05 LLLLLLLianHua

Hi, I guess more details are needed for me to understand you problem.

niujinshuchong avatar May 22 '23 16:05 niujinshuchong

Much appreciated your reply! I wanted to reconstruct a real indoor scene, about 15 square meters, use colmap get image pose,and use following code to process data

import numpy as np
import cv2
import torch
import os
from scipy.spatial.transform import Slerp
from scipy.interpolate import interp1d
from scipy.spatial.transform import Rotation as R
import json
import trimesh
import argparse
import glob
import math
import PIL
import ipdb
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
from torch.nn import functional as F


scenes = ['lab_huang']
out_names = ['scan1']


def qvec2rotmat(qvec):
	return np.array([
		[
			1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,
			2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
			2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]
		], [
			2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
			1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,
			2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]
		], [
			2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
			2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
			1 - 2 * qvec[1]**2 - 2 * qvec[2]**2
		]
	])

def get_pose_colmap(args):
    print("get image extrinsic and intrinsic data from colmap results")

    for scene, out_name in zip(scenes, out_names):

        TEXT_FOLDER = os.path.join(args.data_root, scene, 'sparse', '0')
        with open(os.path.join(TEXT_FOLDER, "cameras.txt"), "r") as f:
            angle_x = math.pi / 2
            for line in f:
                # 1 SIMPLE_RADIAL 2048 1536 1580.46 1024 768 0.0045691
                # 1 OPENCV 3840 2160 3178.27 3182.09 1920 1080 0.159668 -0.231286 -0.00123982 0.00272224
                # 1 RADIAL 1920 1080 1665.1 960 540 0.0672856 -0.0761443
                if line[0] == "#":
                    continue
                els = line.split(" ")
                w = float(els[2])
                h = float(els[3])
                fl_x = float(els[4])
                fl_y = float(els[4])
                k1 = 0
                k2 = 0
                p1 = 0
                p2 = 0
                cx = w / 2
                cy = h / 2
                if els[1] == "SIMPLE_PINHOLE":
                    cx = float(els[5])
                    cy = float(els[6])
                elif els[1] == "PINHOLE":
                    fl_y = float(els[5])
                    cx = float(els[6])
                    cy = float(els[7])
                elif els[1] == "SIMPLE_RADIAL":
                    cx = float(els[5])
                    cy = float(els[6])
                    k1 = float(els[7])
                elif els[1] == "RADIAL":
                    cx = float(els[5])
                    cy = float(els[6])
                    k1 = float(els[7])
                    k2 = float(els[8])
                elif els[1] == "OPENCV":
                    fl_y = float(els[5])
                    cx = float(els[6])
                    cy = float(els[7])
                    k1 = float(els[8])
                    k2 = float(els[9])
                    p1 = float(els[10])
                    p2 = float(els[11])
                else:
                    print("unknown camera model ", els[1])
                angle_x = math.atan(w / (fl_x * 2)) * 2
                angle_y = math.atan(h / (fl_y * 2)) * 2
                fovx = angle_x * 180 / math.pi
                fovy = angle_y * 180 / math.pi

        print(f"camera:\n\tres={w,h}\n\tcenter={cx,cy}\n\tfocal={fl_x,fl_y}\n\tfov={fovx,fovy}\n\tk={k1,k2} p={p1,p2} ")
        intrinsic = np.array([[fl_x, 0, cx],
                            [0, fl_y, cy],
                            [0, 0, 1]])

        print(intrinsic)

        with open(os.path.join(TEXT_FOLDER, "images.txt"), "r") as f:
            i = 0

            bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4])

            extrinsics = []

            for line in f:
                line = line.strip()

                if line[0] == "#":
                    continue

                i = i + 1

                if i % 2 == 1:
                    elems = line.split(" ")  # 1-4 is quat,5-7 is trans, 9ff is filename (9, if filename contains no spaces)

                    name = '_'.join(elems[9:])
                    full_name = os.path.join(args.images, name)

                    qvec = np.array(tuple(map(float, elems[1:5])))
                    tvec = np.array(tuple(map(float, elems[5:8])))
                    R = qvec2rotmat(-qvec)
                    t = tvec.reshape([3, 1])
                    m = np.concatenate([np.concatenate([R, t], 1), bottom], 0)
                    c2w = np.linalg.inv(m)


                    extrinsics.append(c2w)

        
        return intrinsic, extrinsics


def preprocess_data(args, intrinsic, extrinsics):

    image_size = 384
    trans_totensor = transforms.Compose(
        [
            transforms.CenterCrop(1080),
            transforms.Resize(image_size, interpolation=PIL.Image.BILINEAR),
        ]
    )

    out_path_prefix = args.out_path_prefix
    data_root = args.data_root


    for scene, out_name in zip(scenes, out_names):
        out_path = os.path.join(out_path_prefix, out_name)
        os.makedirs(out_path, exist_ok=True)
        print(out_path)

        folders = ["image", "mask", "depth"]
        for folder in folders:
            out_folder = os.path.join(out_path, folder)
            os.makedirs(out_folder, exist_ok=True)

        # load color
        color_path = os.path.join(data_root, scene, 'images')
        color_paths = sorted(glob.glob(os.path.join(color_path, '*.png')),
                            key=lambda x: int(os.path.basename(x)[:-4]))

        # load intrinsic
        camera_intrinsic = intrinsic
        print(camera_intrinsic)

        # load pose
        poses = extrinsics
        poses = np.stack(poses)
        print(poses.shape, len(color_paths))

        # deal with invalid poses
        valid_poses = np.isfinite(poses).all(axis=2).all(axis=1)
        min_vertices = poses[:, :3, 3][valid_poses].min(axis=0)
        max_vertices = poses[:, :3, 3][valid_poses].max(axis=0)

        center = (min_vertices + max_vertices) / 2.
        scale = 2. / (np.max(max_vertices - min_vertices) + 3.)
        print(center, scale)

        # we should normalized to unit cube
        scale_mat = np.eye(4).astype(np.float32)
        scale_mat[:3, 3] = -center
        scale_mat[:3] *= scale
        scale_mat = np.linalg.inv(scale_mat)

        # copy image
        cameras = {}
        pcds = []
        H, W = 1080, 1920
        # center crop by 720
        offset_x = (W - 1080) * 0.5
        offset_y = (H - 1080) * 0.5
        camera_intrinsic[0, 2] -= offset_x
        camera_intrinsic[1, 2] -= offset_y
        # resize
        resize_factor = 384 / 1080
        camera_intrinsic[:2, :] *= resize_factor

        K = np.eye(4)
        K[:3, :3] = camera_intrinsic
        print(K)

        out_index = 0
        for idx, (valid, pose, image_path) in enumerate(zip(valid_poses, poses, color_paths)):
            if not valid:
                continue

            target_image = os.path.join(out_path, "image/%06d.png" % (out_index))

            img = Image.open(image_path)
            img_tensor = trans_totensor(img)
            img_tensor.save(target_image)

            mask = (np.ones((image_size, image_size, 3)) * 255.).astype(np.uint8)

            target_image = os.path.join(out_path, "mask/%03d.png" % (out_index))
            cv2.imwrite(target_image, mask)

            # save pose
            pcds.append(pose[:3, 3])
            pose = K @ np.linalg.inv(pose)

            # cameras["scale_mat_%d"%(out_index)] = np.eye(4).astype(np.float32)
            cameras["scale_mat_%d" % (out_index)] = scale_mat
            cameras["world_mat_%d" % (out_index)] = pose

            out_index += 1
        np.savez(os.path.join(out_path,"cameras.npz"), **cameras)


if __name__ == '__main__':

    intrinsic, extrinsics = get_pose_colmap(args)
    preprocess_data(args, intrinsic, extrinsics)

mesh is like this image

thank you very much

LLLLLLLianHua avatar May 23 '23 06:05 LLLLLLLianHua

Hi, can you maybe check the camera poses normalisations since you are using

center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

which are defined based on scannet dataset with depth sensors but you are using poses from colmap.

niujinshuchong avatar May 23 '23 07:05 niujinshuchong

Thank you for your reply

I have a question about

scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

the numbers 2 and 3 represent what ? when I using colmap how should I change them to adapt to the monosdf

LLLLLLLianHua avatar May 23 '23 10:05 LLLLLLLianHua

2 stands for -1 to 1 and +3 because we assume each camera sees 1.5 meters forward in scannet.

niujinshuchong avatar May 23 '23 16:05 niujinshuchong

Thank you for your patience!

center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

center and scale is image The pose of colmap is not recovered by scale (I use Arkit pose and result is good), do I need to do scale restoration?if not , How do I set the parameters to normalize camera pose

Thank you very much

LLLLLLLianHua avatar May 24 '23 07:05 LLLLLLLianHua

The goal is to make the scene to lies in the range [-1, 1].

niujinshuchong avatar May 24 '23 07:05 niujinshuchong

Thank you for your reply

LLLLLLLianHua avatar May 26 '23 03:05 LLLLLLLianHua

Thank you for your reply

Hi!How did you finally solve it? I met the same problem when I use the following code:

 center = (min_vertices + max_vertices) / 2.
 scale = 2. / (np.max(max_vertices - min_vertices) + 3.)

thucz avatar Jun 28 '23 04:06 thucz

@niujinshuchong I'm really sorry, but I still don't understand how to solve this problem. May I know how to specifically solve this problem?

thucz avatar Jun 30 '23 07:06 thucz

2 stands for -1 to 1 and +3 because we assume each camera sees 1.5 meters forward in scannet.

maybe that's the case for indoor rooms that are a ±square? what if the indoor dataset is a long hallway with low ceiling? then the normalization doesn't make much sense no? XY positions will be normalized between -1 and 1 but the Z values will all be close to 0. I think the solution is to use 2 scales. one for horizontal positions and one for vertical positions

Golbstein avatar Jul 16 '25 08:07 Golbstein