pytorch_depth_from_videos_in_the_wild
evaluate pose
Hi bolian, I used the pretrained pose_net to evaluate the trajectory on KITTI Odometry sequence 09, but the result is bad. Does the pose_net output the relative pose between two images?
Hi @wanglong1008
Thanks for the question. Yes, the pose_net is supposed to output the relative pose between two images. Would you please share how you evaluate the pose? (sample code or a reference repo)
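For reference, a minimal sketch of querying it directly (hypothetical variable names: src and tgt are 1x3xHxW tensors of two consecutive frames; the pose network takes the pair concatenated along the channel dimension and returns the relative motion as an axis-angle rotation plus a translation):

import torch

# src, tgt: 1x3xHxW tensors of two consecutive frames (hypothetical names)
pose_inputs = torch.cat([src, tgt], 1)  # concatenate the pair along channels
with torch.no_grad():
    # relative motion between the two frames
    axisangle, translation, bottleneck, features = pose_net(pose_inputs)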
First, I convert the relative poses to absolute poses like this:

global_pose = np.eye(4)
poses = []
f = open(os.path.join(base, i), "r")
lines = f.readlines()
f.close()
for line in lines:
    # 6-DoF pose vector -> 4x4 homogeneous transform
    pose = [float(a) for a in line.split(" ")]
    pose = torch.tensor(pose)
    pose = torch.unsqueeze(pose, 0)
    pose_mat = pose_vec2mat(pose).squeeze(0).cpu().numpy()
    pose_mat = np.vstack([pose_mat, np.array([0, 0, 0, 1])])
    # chain the inverse relative transform into the accumulated global pose
    global_pose = global_pose @ np.linalg.inv(pose_mat)
    poses.append(global_pose[0:3, :].reshape(1, 12))
# save in KITTI format: one flattened 3x4 pose per row
poses = np.concatenate(poses, axis=0)
filename = Path(base + "_" + str(i))
np.savetxt(filename, poses, delimiter=' ', fmt='%1.8e')
And then I evaluate the pose with the code from https://github.com/JiawangBian/SC-SfMLearner-Release/tree/master/kitti_eval.
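(For reference, pose_vec2mat converts each 6-DoF line into a 3x4 [R | t] matrix. A rough NumPy stand-in, assuming the [tx, ty, tz, rx, ry, rz] Euler-angle convention used in the SC-SfMLearner code, would look roughly like this; the helper name and the exact angle convention are my assumptions and should be checked against the training code.)

import numpy as np

def pose_vec2mat_np(vec):
    """Rough NumPy stand-in for pose_vec2mat (hypothetical helper).
    Assumes vec = [tx, ty, tz, rx, ry, rz] with Euler-angle rotations.
    Returns a 3x4 [R | t] matrix."""
    tx, ty, tz, rx, ry, rz = vec
    cx, sx = np.cos(rx), np.sin(rx)
    cy, sy = np.cos(ry), np.sin(ry)
    cz, sz = np.cos(rz), np.sin(rz)
    Rx = np.array([[1, 0, 0], [0, cx, -sx], [0, sx, cx]])
    Ry = np.array([[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]])
    Rz = np.array([[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]])
    R = Rx @ Ry @ Rz
    t = np.array([[tx], [ty], [tz]])
    return np.hstack([R, t])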
Hi @wanglong1008, I sincerely apologize for my late response. I have marked this issue as "help wanted" as a future check item.
Hi @bolianchen @wanglong1008, I want to estimate the camera intrinsic parameters using the intrinsics head network. I tested the following script to infer the intrinsic parameters between two images (for example, from the 2011_09_26_drive_0001_sync_02 sequence), but the results are very far from the ground-truth KITTI parameters. Do you have any suggestions?
python infer_intrinsic.py --input_path /workspace/generated_data/generated_raw_data_KITTI_128_416/2011_09_26_drive_0001_sync_02 --model_path models/kitti_pretrained/models/best
infer_intrinsic.py:
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt

from lib.data_iterators import ImageReader, VideoReader
from lib.img_processing import ImageProcessor, concat_depth_img
from lib.utils import get_model_opt
from evaluators import EVALUATORS
from options import InferOptions
import glob
import ipdb


class Intrinsic_Inference:
    def __init__(self, args):
        self.args = args
        self._init_evaluator()

    def _init_evaluator(self):
        model_opt = get_model_opt(self.args.model_path)
        self.evaluator = EVALUATORS[model_opt.method](model_opt)

    def infer(self):
        if not os.path.exists(self.args.output_dir):
            os.makedirs(self.args.output_dir)

        if self.args.input_path.endswith('mp4'):
            # Process video
            cap = cv2.VideoCapture(self.args.input_path)
            ret, last_frame = cap.read()
            while cap.isOpened():
                ret, current_frame = cap.read()
                if current_frame is None:
                    break
                K = self.evaluator.estimate_K(last_frame, current_frame)
                print(K)
                last_frame = current_frame
        else:
            # Process images
            cam_config = glob.glob(self.args.input_path + '/*.txt')[0]
            cam_config = open(cam_config, "r").read().split(',')
            cam_config = [float(x) for x in cam_config]
            gt_cam_matrix = np.asarray(
                [[cam_config[0], cam_config[1], cam_config[2]],
                 [cam_config[3], cam_config[4], cam_config[5]],
                 [cam_config[6], cam_config[7], cam_config[8]]])
            list_images = sorted(glob.glob(self.args.input_path + '/*.png'))
            list_images = [x for x in list_images if "-fseg" not in x]
            last_frame = cv2.imread(list_images[0])
            for img in list_images:
                current_frame = cv2.imread(img)
                K = self.evaluator.estimate_K(last_frame, current_frame).squeeze(0)
                K = K.cpu().detach().numpy()
                # drop the homogeneous row and column to get the 3x3 intrinsics
                K = np.delete(K, 3, 0)
                K = np.delete(K, 3, 1)
                print(K)
                last_frame = current_frame
            print("gt_cam_matrix:")
            print(gt_cam_matrix)


if __name__ == '__main__':
    K_estimator = Intrinsic_Inference(InferOptions().parse()[0])
    K_estimator.infer()
MODIFIED wild_evaluator.py
import os
import numpy as np
import torch
from torchvision import transforms

from trainers import WildTrainer
from options import WildOptions
from .base_evaluator import BaseEvaluator
import ipdb


class WildEvaluator(BaseEvaluator):
    def __init__(self, opt):
        self.opt = opt
        if not hasattr(self, 'num_pose_frames'):
            self.num_pose_frames = (
                2 if self.opt.pose_model_input == "pairs"
                else len(self.opt.frame_ids))
        self.opt.models_to_load = [
            'encoder', 'depth', 'pose', 'motion', 'intrinsics_head']
        super().__init__(self.opt)

    def _init_depth_net(self):
        WildTrainer._init_depth_net(self)

    def _init_pose_net(self):
        WildTrainer._init_pose_net(self)

    def _load_models(self):
        WildTrainer.load_model(self)

    def estimate_depth(self, img):
        with torch.no_grad():
            img = transforms.ToTensor()(np.array(img)).unsqueeze(0)
            features = self.models['encoder'](img.to(self.device))
            outputs = self.models['depth'](features)
            depth = outputs[("depth", 0)]
            disp = 1 / depth
            disp_colormap = self._color_disp(disp)
        return disp_colormap, depth

    def estimate_K(self, source_frame, target_frame):
        """Estimate the camera intrinsics from a single pair of frames.

        The pose network is fed [source image, target image] concatenated
        along the channel dimension; its bottleneck features are passed to
        the intrinsics head together with the image width and height.
        """
        height, width = source_frame.shape[:2]
        source_frame = transforms.ToTensor()(np.array(source_frame)).unsqueeze(0).to(self.device)
        target_frame = transforms.ToTensor()(np.array(target_frame)).unsqueeze(0).to(self.device)
        pose_inputs = torch.cat([source_frame, target_frame], 1)
        with torch.no_grad():
            # axisangle and translation from target to source
            axisangle, translation, bottleneck, features = (
                self.models["pose"](pose_inputs))
            K = self.models['intrinsics_head'](bottleneck, width, height)
        return K
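One thing I am not sure about (this is only an assumption on my side, not something I have verified against the repo): estimate_K passes the width and height of the frames I feed in, so the predicted K should correspond to that resolution (416x128 for the generated data), while the intrinsics in the .txt file might be expressed at a different resolution. If so, both would need to be brought to the same resolution before comparing, e.g. with a small helper like this (rescale_intrinsics is hypothetical, not part of this repo):

import numpy as np

def rescale_intrinsics(K, orig_w, orig_h, new_w, new_h):
    """Rescale a 3x3 intrinsics matrix to a new image resolution.
    (Illustrative helper, not part of this repo.)"""
    K = K.copy()
    K[0, :] *= new_w / orig_w  # fx and cx scale with image width
    K[1, :] *= new_h / orig_h  # fy and cy scale with image height
    return K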
Hi, did you write this code yourself?