Working example of the PyTorch3D efficient_pnp() function
❓ Questions on how to use PyTorch3D
Hi, I'm trying to get a minimal version of the efficient_pnp algorithm working with the pytorch3d library. Given my own custom rotation and translation, I want efficient_pnp to recover that rotation and translation from the 3D points and their 2D projections produced by the camera. Any suggestions to make my current code work?
```python
import torch
import numpy as np
from pytorch3d.ops import efficient_pnp
from pytorch3d.renderer.cameras import look_at_view_transform
from pytorch3d.transforms import euler_angles_to_matrix
from pytorch3d.renderer import PerspectiveCameras

##########################################################################################

# create camera
#
# camera_pos    [3] xyz
# img_size      scalar image size
# focal_length  scalar focal length
def make_camera(camera_pos, img_size, focal_length, DEVICE):
    r, t = look_at_view_transform(dist=camera_pos[2], elev=0, azim=0)
    # create projection matrix according to the PyTorch3D convention for PerspectiveCameras
    # https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/renderer/cameras.html
    p_matrix = np.array([focal_length, 0.0, img_size // 2, 0.0,
                         0.0, focal_length, img_size // 2, 0.0,
                         0.0, 0.0, 0.0, 1.0,
                         0.0, 0.0, 1.0, 0.0], dtype=np.float32).reshape(4, 4)
    p_matrix = torch.tensor(p_matrix).float().unsqueeze(0)
    cameras = PerspectiveCameras(device=DEVICE,
                                 R=r,
                                 T=t,
                                 K=p_matrix,
                                 in_ndc=False,
                                 image_size=(img_size, img_size))
    return cameras

# create intrinsic camera matrix from focal length and image width/height
def make_intrinsic_matrix(img_size, focal):
    s = img_size // 2
    p_matrix = np.array([focal, 0.0, s,
                         0.0, focal, s,
                         0.0, 0.0, 1.0], dtype=np.float32)
    return torch.tensor(p_matrix.reshape(3, 3))

###################################################################
if __name__ == '__main__':
    # create camera parameters
    IMG_SIZE = 256
    CAMERA_POS = [0.0, 0.0, -20.0]
    FOCAL_LENGTH = 2000
    DEVICE = 'cuda'
    P_MAT = make_intrinsic_matrix(IMG_SIZE, FOCAL_LENGTH).to(DEVICE)

    # create dummy points
    q = torch.tensor([[0, 0, 0],
                      [0, 0, 1],
                      [0, 1, 0],
                      [1, 0, 0],
                      [0, 0, -1],
                      [0, -1, 0],
                      [-1, 0, 0]]).float().cuda().unsqueeze(0)

    # create camera used for rendering
    cam = make_camera(CAMERA_POS, IMG_SIZE, FOCAL_LENGTH, DEVICE)

    # note: cam.transform_points(q) does not give the same result as
    # cam.transform_points_screen(q)
    # a = cam.transform_points(q)
    b = cam.transform_points_screen(q)

    # for some reason this projects q twice (batch of 2) instead of once,
    # likely because image_size=(h, w) is broadcast to a batch of 2 cameras;
    # image_size=((h, w),) would give a single camera
    batch_size = b.shape[0]
    if q.shape[0] == 1:
        q = q.repeat(batch_size, 1, 1)

    # uncalibrate 2D points
    b[..., 2] = 1
    k_inv = torch.inverse(P_MAT.T)
    b_cam = torch.matmul(b, k_inv.unsqueeze(0))
    # b_cam = torch.matmul(b_cam, flip_matrix)

    # since we don't rotate or translate the 3D shape, the recovered rotation
    # should be the identity and the translation vector [0, 0, 0]
    # skipping the quadratic equation (skip_quadratic_eq=True) seems to be more accurate?
    transform = efficient_pnp(q, b_cam[..., :2])
    print(transform.R)
    print(transform.T)
    # rotation seems to indicate flipping of the x, y axes

    euler_angles = torch.tensor([np.pi / 4.0, np.pi / 4.0, np.pi / 2.0]).float().unsqueeze(0).cuda()
    my_R = euler_angles_to_matrix(euler_angles, 'XYZ')
    my_T = torch.tensor([0.0, 0.2, 0.5]).float().unsqueeze(0).cuda()
    q_rt = torch.matmul(q, my_R) + my_T.unsqueeze(1)
    b_cam = cam.transform_points_screen(q_rt)
    b_cam[..., 2] = 1
    b_cam = torch.matmul(b_cam, k_inv.unsqueeze(0))

    # check to see if we can get back the same rotation
    transform = efficient_pnp(q_rt, b_cam[..., :2])
    print(transform.R)
    print(transform.T)
    print("my R: ", my_R)
    print("my T: ", my_T)
```
After some digging, I found a way to get this to work and compared it against the OpenCV version. To use efficient_pnp() with perspective cameras, a negative focal length has to be applied because of the flip between PyTorch3D's axis convention and the image-space convention. Since I define my camera in screen space (a.k.a. image space), PyTorch3D's implementation of PerspectiveCameras.transform_points_screen() effectively uses a negated focal length when projecting from world to screen space, so the intrinsic matrix used to uncalibrate the 2D points has to negate it as well. Personally, I think this should be stated in either the efficient_pnp() function docs or the PerspectiveCameras class docs.
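Concretely, the only change to the intrinsic matrix relative to my first attempt is the sign of the focal length:

```python
# the negated focal length compensates for the x/y flip between
# PyTorch3D's camera axes and screen-space axes
P_MAT = make_intrinsic_matrix(IMG_SIZE, -FOCAL_LENGTH).to(DEVICE)
```

For anyone facing the same issue, here is the full code I used to get a minimal working example.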
```python
import torch
import numpy as np
import cv2
from pytorch3d.ops import efficient_pnp
from pytorch3d.renderer.cameras import look_at_view_transform
from pytorch3d.transforms import axis_angle_to_matrix
from pytorch3d.renderer import PerspectiveCameras

##########################################################################################

# create camera
#
# camera_pos    [3] xyz
# img_size      scalar image size
# focal_length  scalar focal length
def make_camera(camera_pos, img_size, focal_length, DEVICE):
    r, t = look_at_view_transform(dist=camera_pos[2], elev=0, azim=0)
    # create projection matrix according to the PyTorch3D convention for PerspectiveCameras
    # https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/renderer/cameras.html
    p_matrix = np.array([focal_length, 0.0, img_size // 2, 0.0,
                         0.0, focal_length, img_size // 2, 0.0,
                         0.0, 0.0, 0.0, 1.0,
                         0.0, 0.0, 1.0, 0.0], dtype=np.float32).reshape(4, 4)
    p_matrix = torch.tensor(p_matrix).float().unsqueeze(0)
    cameras = PerspectiveCameras(device=DEVICE,
                                 R=r,
                                 T=t,
                                 K=p_matrix,
                                 in_ndc=False,
                                 image_size=(img_size, img_size))
    return cameras

# create intrinsic camera matrix from focal length and image width/height
def make_intrinsic_matrix(img_size, focal):
    s = img_size // 2
    p_matrix = np.array([focal, 0.0, s,
                         0.0, focal, s,
                         0.0, 0.0, 1.0], dtype=np.float32)
    return torch.tensor(p_matrix.reshape(3, 3))

###################################################################
if __name__ == '__main__':
    # create camera parameters
    IMG_SIZE = 256
    CAMERA_POS = [0.0, 0.0, -20.0]
    FOCAL_LENGTH = 2000
    DEVICE = 'cuda'

    # create dummy points
    q = torch.tensor([[0, 0, 0],
                      [0, 0.3, 1],
                      [0, 1, 0],
                      [1, 0, 0],
                      [0, 0, -1],
                      [0, -1, 0],
                      [-1, 0, 0]]).float().cuda().unsqueeze(0)

    # create camera used for rendering
    cam = make_camera(CAMERA_POS, IMG_SIZE, FOCAL_LENGTH, DEVICE)

    # create constants (note the NEGATED focal length)
    P_MAT = make_intrinsic_matrix(IMG_SIZE, -FOCAL_LENGTH).to(DEVICE)
    K = np.ascontiguousarray(P_MAT.detach().cpu().numpy()).reshape((3, 3))

    # create a custom rotation and translation and apply them to get the 3D and 2D points
    axis_angle = torch.tensor([np.pi / 4.0, -np.pi / 4.0, np.pi / 4.0]).float().unsqueeze(0).cuda()
    my_R = axis_angle_to_matrix(axis_angle).permute(0, 2, 1)
    my_T = torch.tensor([0.0, 0.3, 0.5]).float().unsqueeze(0).cuda()
    q_rt = torch.matmul(q, my_R) + my_T.unsqueeze(1)
    b = cam.transform_points_screen(q_rt)
    num_batch = b.shape[0]
    num_pts = b.shape[1]

    # find rotation and translation: opencv
    p3d = q.detach().cpu().numpy()
    p2d = b[..., :2].detach().cpu().numpy()
    for i in range(num_batch):
        p = np.ascontiguousarray(p2d[i].reshape((num_pts, 1, 2)))
        p_z = np.ascontiguousarray(p3d[0])
        _, R, T = cv2.solvePnP(p_z, p, K, distCoeffs=None, flags=cv2.SOLVEPNP_EPNP)
        print("opencv predicted rot: ", cv2.Rodrigues(-R)[0])
        print("opencv predicted trans: ", T)

    # find rotation and translation: pytorch3d
    p2d_h = torch.cat((b[..., :2], torch.ones(num_batch, num_pts, 1).to(b.device)), dim=-1)
    p_inv = torch.inverse(P_MAT).T
    p2d_uncal = torch.matmul(p2d_h, p_inv.unsqueeze(0))
    transform = efficient_pnp(q.repeat(num_batch, 1, 1), p2d_uncal[..., :2])
    print(transform.R)
    print(transform.T)
    print('done')
```
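A quick way to sanity-check the solution (just a sketch on top of the variables above, not part of the original script): transform the model points with the recovered pose and compare the perspective projection against the uncalibrated 2D points. efficient_pnp also reports these errors itself, since it returns an EpnpSolution named tuple with err_2d and err_3d fields alongside R and T.

```python
# reproject the model points with the recovered pose
# (row-vector convention: x @ R + T, matching the rest of the script)
x_cam = torch.matmul(q.repeat(num_batch, 1, 1), transform.R) + transform.T.unsqueeze(1)
reproj = x_cam[..., :2] / x_cam[..., 2:]  # perspective divide
print("max reprojection error: ", (reproj - p2d_uncal[..., :2]).abs().max().item())

# errors reported directly by efficient_pnp
print("err_2d: ", transform.err_2d)
print("err_3d: ", transform.err_3d)
```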
Hello @yhu9,
I have some difficulties using the efficient_pnp() function as well.
I went through your comments. Do I understand correctly that the main change is the negative focal length? You then use this intrinsic matrix to go from pixel space, where the x-axis points right and the y-axis points down with the origin in the upper-left corner, to rays in the camera coordinate frame, the so-called uncalibrated 2D points. Is that correct?
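In code, this is how I currently picture that step (reusing the constants and the screen-space points b from your example), please correct me if I'm wrong:

```python
# homogeneous pixel coordinates (origin upper-left, +x right, +y down)
pix_h = torch.cat((b[..., :2], torch.ones_like(b[..., :1])), dim=-1)
# intrinsics built with the negated focal length, inverted to map pixels
# to rays in the camera frame ("uncalibrated" 2D points)
K = make_intrinsic_matrix(IMG_SIZE, -FOCAL_LENGTH).to(DEVICE)
rays = torch.matmul(pix_h, torch.inverse(K).T)  # row-vector convention
```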
Thanks.