realworld reconstruction using colmap
Hi, thanks for the wonferful work!
I want to reconstruct the real indoor scene and use colmap to recover the pose, but the reconstruction results are very poor,how can i solve it?
Thank you very much!
Hi, I guess more details are needed for me to understand you problem.
Much appreciated your reply! I wanted to reconstruct a real indoor scene, about 15 square meters, use colmap get image pose,and use following code to process data
import numpy as np
import cv2
import torch
import os
from scipy.spatial.transform import Slerp
from scipy.interpolate import interp1d
from scipy.spatial.transform import Rotation as R
import json
import trimesh
import argparse
import glob
import math
import PIL
import ipdb
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
from torch.nn import functional as F
scenes = ['lab_huang']
out_names = ['scan1']
def qvec2rotmat(qvec):
return np.array([
[
1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,
2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]
], [
2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,
2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]
], [
2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
1 - 2 * qvec[1]**2 - 2 * qvec[2]**2
]
])
def get_pose_colmap(args):
print("get image extrinsic and intrinsic data from colmap results")
for scene, out_name in zip(scenes, out_names):
TEXT_FOLDER = os.path.join(args.data_root, scene, 'sparse', '0')
with open(os.path.join(TEXT_FOLDER, "cameras.txt"), "r") as f:
angle_x = math.pi / 2
for line in f:
# 1 SIMPLE_RADIAL 2048 1536 1580.46 1024 768 0.0045691
# 1 OPENCV 3840 2160 3178.27 3182.09 1920 1080 0.159668 -0.231286 -0.00123982 0.00272224
# 1 RADIAL 1920 1080 1665.1 960 540 0.0672856 -0.0761443
if line[0] == "#":
continue
els = line.split(" ")
w = float(els[2])
h = float(els[3])
fl_x = float(els[4])
fl_y = float(els[4])
k1 = 0
k2 = 0
p1 = 0
p2 = 0
cx = w / 2
cy = h / 2
if els[1] == "SIMPLE_PINHOLE":
cx = float(els[5])
cy = float(els[6])
elif els[1] == "PINHOLE":
fl_y = float(els[5])
cx = float(els[6])
cy = float(els[7])
elif els[1] == "SIMPLE_RADIAL":
cx = float(els[5])
cy = float(els[6])
k1 = float(els[7])
elif els[1] == "RADIAL":
cx = float(els[5])
cy = float(els[6])
k1 = float(els[7])
k2 = float(els[8])
elif els[1] == "OPENCV":
fl_y = float(els[5])
cx = float(els[6])
cy = float(els[7])
k1 = float(els[8])
k2 = float(els[9])
p1 = float(els[10])
p2 = float(els[11])
else:
print("unknown camera model ", els[1])
angle_x = math.atan(w / (fl_x * 2)) * 2
angle_y = math.atan(h / (fl_y * 2)) * 2
fovx = angle_x * 180 / math.pi
fovy = angle_y * 180 / math.pi
print(f"camera:\n\tres={w,h}\n\tcenter={cx,cy}\n\tfocal={fl_x,fl_y}\n\tfov={fovx,fovy}\n\tk={k1,k2} p={p1,p2} ")
intrinsic = np.array([[fl_x, 0, cx],
[0, fl_y, cy],
[0, 0, 1]])
print(intrinsic)
with open(os.path.join(TEXT_FOLDER, "images.txt"), "r") as f:
i = 0
bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4])
extrinsics = []
for line in f:
line = line.strip()
if line[0] == "#":
continue
i = i + 1
if i % 2 == 1:
elems = line.split(" ") # 1-4 is quat,5-7 is trans, 9ff is filename (9, if filename contains no spaces)
name = '_'.join(elems[9:])
full_name = os.path.join(args.images, name)
qvec = np.array(tuple(map(float, elems[1:5])))
tvec = np.array(tuple(map(float, elems[5:8])))
R = qvec2rotmat(-qvec)
t = tvec.reshape([3, 1])
m = np.concatenate([np.concatenate([R, t], 1), bottom], 0)
c2w = np.linalg.inv(m)
extrinsics.append(c2w)
return intrinsic, extrinsics
def preprocess_data(args, intrinsic, extrinsics):
image_size = 384
trans_totensor = transforms.Compose(
[
transforms.CenterCrop(1080),
transforms.Resize(image_size, interpolation=PIL.Image.BILINEAR),
]
)
out_path_prefix = args.out_path_prefix
data_root = args.data_root
for scene, out_name in zip(scenes, out_names):
out_path = os.path.join(out_path_prefix, out_name)
os.makedirs(out_path, exist_ok=True)
print(out_path)
folders = ["image", "mask", "depth"]
for folder in folders:
out_folder = os.path.join(out_path, folder)
os.makedirs(out_folder, exist_ok=True)
# load color
color_path = os.path.join(data_root, scene, 'images')
color_paths = sorted(glob.glob(os.path.join(color_path, '*.png')),
key=lambda x: int(os.path.basename(x)[:-4]))
# load intrinsic
camera_intrinsic = intrinsic
print(camera_intrinsic)
# load pose
poses = extrinsics
poses = np.stack(poses)
print(poses.shape, len(color_paths))
# deal with invalid poses
valid_poses = np.isfinite(poses).all(axis=2).all(axis=1)
min_vertices = poses[:, :3, 3][valid_poses].min(axis=0)
max_vertices = poses[:, :3, 3][valid_poses].max(axis=0)
center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)
print(center, scale)
# we should normalized to unit cube
scale_mat = np.eye(4).astype(np.float32)
scale_mat[:3, 3] = -center
scale_mat[:3] *= scale
scale_mat = np.linalg.inv(scale_mat)
# copy image
cameras = {}
pcds = []
H, W = 1080, 1920
# center crop by 720
offset_x = (W - 1080) * 0.5
offset_y = (H - 1080) * 0.5
camera_intrinsic[0, 2] -= offset_x
camera_intrinsic[1, 2] -= offset_y
# resize
resize_factor = 384 / 1080
camera_intrinsic[:2, :] *= resize_factor
K = np.eye(4)
K[:3, :3] = camera_intrinsic
print(K)
out_index = 0
for idx, (valid, pose, image_path) in enumerate(zip(valid_poses, poses, color_paths)):
if not valid:
continue
target_image = os.path.join(out_path, "image/%06d.png" % (out_index))
img = Image.open(image_path)
img_tensor = trans_totensor(img)
img_tensor.save(target_image)
mask = (np.ones((image_size, image_size, 3)) * 255.).astype(np.uint8)
target_image = os.path.join(out_path, "mask/%03d.png" % (out_index))
cv2.imwrite(target_image, mask)
# save pose
pcds.append(pose[:3, 3])
pose = K @ np.linalg.inv(pose)
# cameras["scale_mat_%d"%(out_index)] = np.eye(4).astype(np.float32)
cameras["scale_mat_%d" % (out_index)] = scale_mat
cameras["world_mat_%d" % (out_index)] = pose
out_index += 1
np.savez(os.path.join(out_path,"cameras.npz"), **cameras)
if __name__ == '__main__':
intrinsic, extrinsics = get_pose_colmap(args)
preprocess_data(args, intrinsic, extrinsics)
mesh is like this
thank you very much
Hi, can you maybe check the camera poses normalisations since you are using
center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)
which are defined based on scannet dataset with depth sensors but you are using poses from colmap.
Thank you for your reply
I have a question about
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)
the numbers 2 and 3 represent what ? when I using colmap how should I change them to adapt to the monosdf
2 stands for -1 to 1 and +3 because we assume each camera sees 1.5 meters forward in scannet.
Thank you for your patience!
center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)
center and scale is
The pose of colmap is not recovered by scale (I use Arkit pose and result is good), do I need to do scale restoration?if not , How do I set the parameters to normalize camera pose
Thank you very much
The goal is to make the scene to lies in the range [-1, 1].
Thank you for your reply
Thank you for your reply
Hi!How did you finally solve it? I met the same problem when I use the following code:
center = (min_vertices + max_vertices) / 2.
scale = 2. / (np.max(max_vertices - min_vertices) + 3.)
@niujinshuchong I'm really sorry, but I still don't understand how to solve this problem. May I know how to specifically solve this problem?
2 stands for -1 to 1 and +3 because we assume each camera sees 1.5 meters forward in scannet.
maybe that's the case for indoor rooms that are a ±square? what if the indoor dataset is a long hallway with low ceiling? then the normalization doesn't make much sense no? XY positions will be normalized between -1 and 1 but the Z values will all be close to 0. I think the solution is to use 2 scales. one for horizontal positions and one for vertical positions