
Working Example of Pytorch3d efficient_pnp() function


❓ Questions on how to use PyTorch3D

Hi, I'm trying to get a minimal version of the efficient_pnp algorithm to work using the pytorch3d library. Given a custom rotation and translation, I want efficient_pnp to recover them from the 3D points and their 2D projections produced by the camera. Any suggestions to make my current code work?
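
For context, this is the round trip I expect to work in the simplest setting, with synthetic points and no camera object (a sketch, assuming efficient_pnp follows the y = Proj(x @ R + T) row-vector convention from its docstring):

import torch
from pytorch3d.ops import efficient_pnp
from pytorch3d.transforms import euler_angles_to_matrix

# ground-truth pose (row-vector convention: x_cam = x @ R + T)
R = euler_angles_to_matrix(torch.tensor([[0.1, 0.2, 0.3]]), 'XYZ')
T = torch.tensor([[0.0, 0.0, 5.0]])           # keep points in front of the camera (z > 0)

x = torch.rand(1, 8, 3) - 0.5                 # 8 random 3d points in a unit cube
x_cam = torch.bmm(x, R) + T.unsqueeze(1)      # world -> camera frame
y = x_cam[..., :2] / x_cam[..., 2:]           # perspective projection onto z = 1

sol = efficient_pnp(x, y)
print(sol.R)                                  # should be close to R
print(sol.T)                                  # should be close to T

My trouble starts when the 2D points come from a PerspectiveCameras object instead of a manual projection: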

import torch
import numpy as np

from pytorch3d.ops import efficient_pnp
from pytorch3d.renderer.cameras import look_at_view_transform
from pytorch3d.transforms import euler_angles_to_matrix
from pytorch3d.renderer import PerspectiveCameras

##########################################################################################
##########################################################################################
##########################################################################################

# create camera
#
# camera_pos        [3]         xyz
# img_size          scalar      image size
# focal_length      scalar      focal length
def make_camera(camera_pos,img_size,focal_length,DEVICE):
    r,t = look_at_view_transform(
        dist=camera_pos[2],
        elev=0,
        azim=0)

    # create projection matrix according to pytorch3d convention. Under PerspectiveCameras
    # https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/renderer/cameras.html
    p_matrix = np.array([focal_length, 0.0, img_size // 2, 0.0, 
                        0.0, focal_length, img_size // 2, 0.0, 
                        0.0, 0.0, 0.0, 1.0,
                        0.0, 0.0, 1.0, 0.0],dtype=np.float32).reshape(4,4)
    p_matrix = torch.tensor(p_matrix).float().unsqueeze(0)

    cameras = PerspectiveCameras(device=DEVICE, 
        R=r,
        T=t,
        K = p_matrix,
        in_ndc = False,
        image_size=(img_size,img_size)
        )

    return cameras

# create intrinsic camera matrix from focal length and image width/height
def make_intrinsic_matrix(img_size,focal):
    s = img_size // 2
    p_matrix = np.array([focal, 0.0, s, 
                        0.0, focal, s, 
                        0.0, 0.0, 1.0],dtype=np.float32)

    return torch.tensor(p_matrix.reshape(3,3))


###################################################################
if __name__ == '__main__':

    # create camera parameters
    IMG_SIZE = 256
    CAMERA_POS = [0.0,0.0,-20.0]
    FOCAL_LENGTH = 2000
    DEVICE='cuda'
    P_MAT = make_intrinsic_matrix(IMG_SIZE,FOCAL_LENGTH).to(DEVICE)

    # create dummy points
    q = torch.tensor(
        [[0,0,0],
        [0,0,1],
        [0,1,0],
        [1,0,0],
        [0,0,-1],
        [0,-1,0],
        [-1,0,0]]).float().cuda().unsqueeze(0)

    # create camera used for rendering
    cam = make_camera(CAMERA_POS, IMG_SIZE, FOCAL_LENGTH, DEVICE)

    # project the 3d points to screen space
    # note: cam.transform_points(q) does not give the same result as cam.transform_points_screen(q)
    # a = cam.transform_points(q)
    b = cam.transform_points_screen(q)
    
    # for some reason transform_points_screen returns a batch of 2 projections instead of projecting q once, so match the batch sizes
    batch_size = b.shape[0]
    if q.shape[0] == 1:
        q = q.repeat(batch_size,1,1)

    # uncalibrate 2D points
    b[...,2] = 1
    k_inv = torch.inverse(P_MAT.T)
    b_cam = torch.matmul(b,k_inv.unsqueeze(0))

    # b_cam = torch.matmul(b_cam,flip_matrix)

    # since we don't rotate or translate the 3d shape, the recovered rotation
    # should be the identity and the translation should be [0,0,0]
    # (skipping the quadratic equation seems to be more accurate?)
    transform = efficient_pnp(q, b_cam[...,:2])
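    # variant mentioned in the comment above; skip_quadratic_eq is a keyword
    # argument of efficient_pnp:
    # transform = efficient_pnp(q, b_cam[...,:2], skip_quadratic_eq=True)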
    print(transform.R)
    print(transform.T)

    # the recovered rotation seems to indicate a flip of the x and y axes
    euler_angles = torch.tensor([np.pi / 4.0, np.pi/4.0, np.pi / 2.0]).float().unsqueeze(0).cuda()
    my_R = euler_angles_to_matrix(euler_angles,'XYZ')
    my_T = torch.tensor([0.0,0.2,0.5]).float().unsqueeze(0).cuda() 

    q_rt = torch.matmul(q,my_R) + my_T.unsqueeze(1)
    b_cam = cam.transform_points_screen(q_rt)
    b_cam[...,2] = 1
    b_cam = torch.matmul(b_cam,k_inv.unsqueeze(0))

    # check to see if we can get back the same rotation
    transform = efficient_pnp(q_rt, b_cam[...,:2])
    print(transform.R)
    print(transform.T)
    print("my R: ", my_R)
    print("my T: ", my_T)

— yhu9, Sep 08 '22

After some digging, I found a way to get this to work and compared it with the OpenCV version. To use efficient_pnp() with perspective cameras, a negative focal length has to be applied because of the flip between pytorch3d's axis convention (+X left, +Y up) and the image-space convention (+x right, +y down). Since I define my camera in screen space (a.k.a. image space), the negated focal length is what makes the uncalibrated 2D points consistent with how pytorch3d's PerspectiveCameras.transform_points_screen() projects from world to screen space. Personally, I think this should be stated in either the efficient_pnp() function docs or the PerspectiveCameras class docs. For anyone facing the same issue, here is the code I used to get a minimal working example.

import torch
import numpy as np
import cv2

from pytorch3d.ops import efficient_pnp
from pytorch3d.renderer.cameras import look_at_view_transform
from pytorch3d.transforms import axis_angle_to_matrix
from pytorch3d.renderer import PerspectiveCameras

##########################################################################################
##########################################################################################
##########################################################################################

# create camera
#
# camera_pos        [3]         xyz
# img_size          scalar      image size
# focal_length      scalar      focal length
def make_camera(camera_pos,img_size,focal_length,DEVICE):
    r,t = look_at_view_transform(
        dist=camera_pos[2],
        elev=0,
        azim=0)

    # create projection matrix according to pytorch3d convention. Under PerspectiveCameras
    # https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/renderer/cameras.html
    p_matrix = np.array([focal_length, 0.0, img_size // 2, 0.0, 
                        0.0, focal_length, img_size // 2, 0.0, 
                        0.0, 0.0, 0.0, 1.0,
                        0.0, 0.0, 1.0, 0.0],dtype=np.float32).reshape(4,4)
    p_matrix = torch.tensor(p_matrix).float().unsqueeze(0)

    cameras = PerspectiveCameras(device=DEVICE, 
        R=r,
        T=t,
        K = p_matrix,
        in_ndc = False,
        image_size=(img_size,img_size)
        )

    return cameras

# create intrinsic camera matrix from focal length and image width/height
def make_intrinsic_matrix(img_size,focal):
    s = img_size // 2
    p_matrix = np.array([focal, 0.0, s, 
                        0.0, focal, s, 
                        0.0, 0.0, 1.0],dtype=np.float32)

    return torch.tensor(p_matrix.reshape(3,3))


###################################################################
if __name__ == '__main__':

    # create camera parameters
    IMG_SIZE = 256
    CAMERA_POS = [0.0,0.0,-20.0]
    FOCAL_LENGTH = 2000
    DEVICE='cuda'
    
    # create dummy points
    q = torch.tensor(
        [[0,0,0],
        [0,0.3,1],
        [0,1,0],
        [1,0,0],
        [0,0,-1],
        [0,-1,0],
        [-1,0,0]]).float().cuda().unsqueeze(0)

    # create camera used for rendering
    cam = make_camera(CAMERA_POS, IMG_SIZE, FOCAL_LENGTH, DEVICE)
    
    # create constants; note the NEGATED focal length (see comment above)
    P_MAT = make_intrinsic_matrix(IMG_SIZE,-FOCAL_LENGTH).to(DEVICE)
    K = np.ascontiguousarray(P_MAT.detach().cpu().numpy()).reshape((3,3))
    
    # create custom rotation and translation and apply to get 3d and 2d points
    axis_angle = torch.tensor([np.pi / 4.0, -np.pi/4.0, np.pi / 4.0]).float().unsqueeze(0).cuda()
    my_R = axis_angle_to_matrix(axis_angle).permute(0,2,1)
    my_T = torch.tensor([0.0,0.3,0.5]).float().unsqueeze(0).cuda()
    q_rt = torch.matmul(q,my_R) + my_T.unsqueeze(1)
    b = cam.transform_points_screen(q_rt)
    num_batch = b.shape[0]
    num_pts = b.shape[1]

    # find rotation and translation
    # opencv
    p3d = q.detach().cpu().numpy()
    p2d = b[...,:2].detach().cpu().numpy()
    for i in range(num_batch):
        p = np.ascontiguousarray(p2d[i].reshape((num_pts,1,2)))
        p_z = np.ascontiguousarray(p3d[0])
        _, rvec, tvec = cv2.solvePnP(p_z, p, K, distCoeffs=None, flags=cv2.SOLVEPNP_EPNP)
        # negating the Rodrigues vector transposes the rotation, converting from
        # opencv's column-vector convention to pytorch3d's row-vector convention
        print("opencv predicted rot: ", cv2.Rodrigues(-rvec)[0])
        print("opencv predicted trans: ", tvec)
    
    # find rotation and translation
    # pytorch3d (P_MAT with the negated focal length was already created above)
    p2d_h = torch.cat((b[...,:2], torch.ones(num_batch, num_pts, 1).to(b.device)), dim=-1)
    # "uncalibrate": map homogeneous screen points back to rays in the camera frame
    p_inv = torch.inverse(P_MAT).T
    p2d_uncal = torch.matmul(p2d_h, p_inv.unsqueeze(0))
    transform = efficient_pnp(q.repeat(num_batch,1,1), p2d_uncal[...,:2])
    print(transform.R)
    print(transform.T)
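    # (sketch) sanity-check the recovered pose: reproject the 3d points with the
    # recovered transform.R / transform.T and compare against the uncalibrated
    # 2d inputs; the residual should be near zero
    q_cam = torch.matmul(q.repeat(num_batch,1,1), transform.R) + transform.T.unsqueeze(1)
    reproj = q_cam[...,:2] / q_cam[...,2:]
    print("max reprojection error: ", (reproj - p2d_uncal[...,:2]).abs().max().item())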

    print('done')

— yhu9, Sep 13 '22

Hello, @yhu9

I am having some difficulties using the efficient_pnp() function as well.

I went through your comments. Do I understand correctly that the main change is the negative focal length? You then use this intrinsic matrix to go from pixel space, where the x-axis points right and the y-axis points down with the origin in the upper-left corner, to rays in the camera coordinate frame, i.e. the so-called uncalibrated 2D points. Is that correct? In code, I imagine that step looks roughly like the sketch below (assuming a 256x256 image and focal length 2000, as in your script).
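
import torch

f, c = 2000.0, 128.0                       # focal length and principal point
K_neg = torch.tensor([[-f, 0.0, c],
                      [0.0, -f, c],
                      [0.0, 0.0, 1.0]])    # intrinsics with the focal length negated
pix = torch.tensor([[c, c, 1.0],           # principal point -> ray [0, 0, 1]
                    [c + 100.0, c, 1.0]])  # 100 px to the right of the center
rays = pix @ torch.inverse(K_neg).T        # row-vector convention: p @ K^-T
print(rays)
# the second ray has x = -100/f < 0: "right" in the image maps to negative x,
# consistent with pytorch3d's +X-left camera convention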

Thanks.

— gsvirskiy, Feb 12 '24