pytorch_depth_from_videos_in_the_wild
evaluate pose
Hi bolian, I used the pretrained pose_net to evaluate the trajectory on KITTI Odometry sequence 09, but the result is bad. Does the pose_net output the relative pose between two images?
Hi @wanglong1008
Thanks for the question. Yes, the pose_net is supposed to output the relative pose between two images. Would you please share how you evaluate the pose? (sample code or a reference repo)
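For reference, a minimal sketch of querying it directly (hypothetical variable names: src and tgt are 1x3xHxW tensors of two consecutive frames; the pose network takes the pair concatenated along the channel dimension and returns the relative motion as an axis-angle rotation plus a translation):

import torch

# src, tgt: 1x3xHxW tensors of two consecutive frames (hypothetical names)
pose_inputs = torch.cat([src, tgt], 1)  # concatenate the pair along channels
with torch.no_grad():
    # relative motion between the two frames
    axisangle, translation, bottleneck, features = pose_net(pose_inputs)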
First, I convert the relative poses to absolute poses like this:

global_pose = np.eye(4)
poses = []
f = open(os.path.join(base, i), "r")
lines = f.readlines()
f.close()
for line in lines:
    # 6-DoF pose vector -> 4x4 homogeneous transform
    pose = [float(a) for a in line.split(" ")]
    pose = torch.tensor(pose)
    pose = torch.unsqueeze(pose, 0)
    pose_mat = pose_vec2mat(pose).squeeze(0).cpu().numpy()
    pose_mat = np.vstack([pose_mat, np.array([0, 0, 0, 1])])
    # chain the inverse relative transform into the accumulated global pose
    global_pose = global_pose @ np.linalg.inv(pose_mat)
    poses.append(global_pose[0:3, :].reshape(1, 12))
# save in KITTI format: one flattened 3x4 pose per row
poses = np.concatenate(poses, axis=0)
filename = Path(base + "_" + str(i))
np.savetxt(filename, poses, delimiter=' ', fmt='%1.8e')
And then I evaluate the pose with the code from https://github.com/JiawangBian/SC-SfMLearner-Release/tree/master/kitti_eval.
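(For reference, pose_vec2mat converts each 6-DoF line into a 3x4 [R | t] matrix. A rough NumPy stand-in, assuming the [tx, ty, tz, rx, ry, rz] Euler-angle convention used in the SC-SfMLearner code, would look roughly like this; the helper name and the exact angle convention are my assumptions and should be checked against the training code.)

import numpy as np

def pose_vec2mat_np(vec):
    """Rough NumPy stand-in for pose_vec2mat (hypothetical helper).
    Assumes vec = [tx, ty, tz, rx, ry, rz] with Euler-angle rotations.
    Returns a 3x4 [R | t] matrix."""
    tx, ty, tz, rx, ry, rz = vec
    cx, sx = np.cos(rx), np.sin(rx)
    cy, sy = np.cos(ry), np.sin(ry)
    cz, sz = np.cos(rz), np.sin(rz)
    Rx = np.array([[1, 0, 0], [0, cx, -sx], [0, sx, cx]])
    Ry = np.array([[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]])
    Rz = np.array([[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]])
    R = Rx @ Ry @ Rz
    t = np.array([[tx], [ty], [tz]])
    return np.hstack([R, t])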
Hi @wanglong1008, I sincerely apologize for my late response. I have marked this issue as "help wanted" as a future check item.
Hi @bolianchen @wanglong1008, I want to estimate the camera intrinsic parameters using the intrinsics head network. I tested the following script to infer the intrinsic parameters between two images (for example, from the 2011_09_26_drive_0001_sync_02 sequence), but the results are very far from the ground-truth KITTI parameters. Do you have any suggestions?
python infer_intrinsic.py --input_path /workspace/generated_data/generated_raw_data_KITTI_128_416/2011_09_26_drive_0001_sync_02 --model_path models/kitti_pretrained/models/best
infer_intrinsic.py:
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt

from lib.data_iterators import ImageReader, VideoReader
from lib.img_processing import ImageProcessor, concat_depth_img
from lib.utils import get_model_opt
from evaluators import EVALUATORS
from options import InferOptions
import glob
import ipdb


class Intrinsic_Inference:
    def __init__(self, args):
        self.args = args
        self._init_evaluator()

    def _init_evaluator(self):
        model_opt = get_model_opt(self.args.model_path)
        self.evaluator = EVALUATORS[model_opt.method](model_opt)

    def infer(self):
        if not os.path.exists(self.args.output_dir):
            os.makedirs(self.args.output_dir)

        if self.args.input_path.endswith('mp4'):
            # Process video
            cap = cv2.VideoCapture(self.args.input_path)
            ret, last_frame = cap.read()
            while cap.isOpened():
                ret, current_frame = cap.read()
                if current_frame is None:
                    break
                K = self.evaluator.estimate_K(last_frame, current_frame)
                print(K)
                last_frame = current_frame
        else:
            # Process images
            cam_config = glob.glob(self.args.input_path + '/*.txt')[0]
            cam_config = open(cam_config, "r").read().split(',')
            cam_config = [float(x) for x in cam_config]
            gt_cam_matrix = np.asarray(
                [[cam_config[0], cam_config[1], cam_config[2]],
                 [cam_config[3], cam_config[4], cam_config[5]],
                 [cam_config[6], cam_config[7], cam_config[8]]])
            list_images = sorted(glob.glob(self.args.input_path + '/*.png'))
            list_images = [x for x in list_images if "-fseg" not in x]
            last_frame = cv2.imread(list_images[0])
            for img in list_images:
                current_frame = cv2.imread(img)
                K = self.evaluator.estimate_K(last_frame, current_frame).squeeze(0)
                K = K.cpu().detach().numpy()
                # drop the homogeneous row and column to get the 3x3 intrinsics
                K = np.delete(K, 3, 0)
                K = np.delete(K, 3, 1)
                print(K)
                last_frame = current_frame
            print("gt_cam_matrix:")
            print(gt_cam_matrix)


if __name__ == '__main__':
    K_estimator = Intrinsic_Inference(InferOptions().parse()[0])
    K_estimator.infer()
MODIFIED wild_evaluator.py
import os
import numpy as np
import torch
from torchvision import transforms

from trainers import WildTrainer
from options import WildOptions
from .base_evaluator import BaseEvaluator
import ipdb


class WildEvaluator(BaseEvaluator):
    def __init__(self, opt):
        self.opt = opt
        if not hasattr(self, 'num_pose_frames'):
            self.num_pose_frames = (
                2 if self.opt.pose_model_input == "pairs"
                else len(self.opt.frame_ids))
        self.opt.models_to_load = [
            'encoder', 'depth', 'pose', 'motion', 'intrinsics_head']
        super().__init__(self.opt)

    def _init_depth_net(self):
        WildTrainer._init_depth_net(self)

    def _init_pose_net(self):
        WildTrainer._init_pose_net(self)

    def _load_models(self):
        WildTrainer.load_model(self)

    def estimate_depth(self, img):
        with torch.no_grad():
            img = transforms.ToTensor()(np.array(img)).unsqueeze(0)
            features = self.models['encoder'](img.to(self.device))
            outputs = self.models['depth'](features)
            depth = outputs[("depth", 0)]
            disp = 1 / depth
            disp_colormap = self._color_disp(disp)
        return disp_colormap, depth

    def estimate_K(self, source_frame, target_frame):
        """Estimate the camera intrinsics from a single pair of frames.

        The pose network is fed [source image, target image] concatenated
        along the channel dimension; its bottleneck features are passed to
        the intrinsics head together with the image width and height.
        """
        height, width = source_frame.shape[:2]
        source_frame = transforms.ToTensor()(np.array(source_frame)).unsqueeze(0).to(self.device)
        target_frame = transforms.ToTensor()(np.array(target_frame)).unsqueeze(0).to(self.device)
        pose_inputs = torch.cat([source_frame, target_frame], 1)
        with torch.no_grad():
            # axisangle and translation from target to source
            axisangle, translation, bottleneck, features = (
                self.models["pose"](pose_inputs))
            K = self.models['intrinsics_head'](bottleneck, width, height)
        return K
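One thing I am not sure about (this is only an assumption on my side, not something I have verified against the repo): estimate_K passes the width and height of the frames I feed in, so the predicted K should correspond to that resolution (416x128 for the generated data), while the intrinsics in the .txt file might be expressed at a different resolution. If so, both would need to be brought to the same resolution before comparing, e.g. with a small helper like this (rescale_intrinsics is hypothetical, not part of this repo):

import numpy as np

def rescale_intrinsics(K, orig_w, orig_h, new_w, new_h):
    """Rescale a 3x3 intrinsics matrix to a new image resolution.
    (Illustrative helper, not part of this repo.)"""
    K = K.copy()
    K[0, :] *= new_w / orig_w  # fx and cx scale with image width
    K[1, :] *= new_h / orig_h  # fy and cy scale with image height
    return K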
Hi, did you write this code yourself?