mujoco-py icon indicating copy to clipboard operation
mujoco-py copied to clipboard

convert 3d point to its 2d pixel projection

Open anat-T opened this issue 11 months ago • 0 comments

Hi, I am trying to convert 3D points, which represent object positions from a MuJoCo environment, into their 2D pixel projections.

What I tried (code below):

# Machine epsilon for float64, and the threshold (4 * eps) below which
# mat2euler treats the matrix as degenerate.
_FLOAT_EPS = np.finfo(np.float64).eps
_EPS4 = 4.0 * _FLOAT_EPS


def mat2euler(mat):
    """
    Convert 3x3 matrices into triples of angles [rad].

    :param mat: array-like whose last two dimensions are (3, 3); any leading
        dimensions are treated as a batch.
    :return: float64 array of shape ``mat.shape[:-1]`` holding three angles
        per matrix, computed from the matrix entries with ``arctan2``.
        NOTE(review): presumably an x->y->z Euler convention -- confirm
        against the caller's expectations.
    """
    rot = np.asarray(mat, dtype=np.float64)
    assert rot.shape[-2:] == (3, 3), "Invalid shape matrix {}".format(rot)

    # Name the entries that take part in the extraction (row, column).
    m00, m01, m02 = rot[..., 0, 0], rot[..., 0, 1], rot[..., 0, 2]
    m10, m11, m12 = rot[..., 1, 0], rot[..., 1, 1], rot[..., 1, 2]
    m22 = rot[..., 2, 2]

    # cy is the norm of (m22, m12); when it is ~0 the first and third angles
    # cannot be separated, so a fallback formula is used for them.
    cy = np.sqrt(m22 * m22 + m12 * m12)
    well_conditioned = cy > _EPS4

    angle0 = np.where(well_conditioned, -np.arctan2(m12, m22), 0.0)
    # The middle angle uses the same expression in both cases.
    angle1 = -np.arctan2(-m02, cy)
    angle2 = np.where(
        well_conditioned,
        -np.arctan2(m01, m00),
        -np.arctan2(-m10, m11),
    )
    return np.stack([angle0, angle1, angle2], axis=-1)

def get_global_position_of_cube(env, name):
    """
    Look up a named body in the environment's MuJoCo data and return its position.

    :param env: object exposing ``_mj_data`` with a ``body(name)`` accessor
        and an ``xpos`` table indexed by body id.
    :param name: name of the body to look up.
    :return: dict with keys ``"name"`` (the given name) and ``"pos"`` (the
        ``xpos`` entry for that body; presumably the world-frame position).
    """
    body_index = env._mj_data.body(name).id
    return {"name": name, "pos": env._mj_data.xpos[body_index]}

def global2label(obj_pos, cam_pos, cam_ori, output_size=(64, 64), fov=90, s=1):
    """
    Project a 3D point into pixel coordinates and build a label image around it.

    :param obj_pos: 3D coordinates of the joint from MuJoCo in nparray [m]
    :param cam_pos: 3D coordinates of the camera from MuJoCo in nparray [m]
    :param cam_ori: camera 3D rotation (Rotation order of x->y->z) from MuJoCo in nparray [rad]
    :param output_size: two-element sequence; half of each entry is used as
        the projection offset, and both entries are forwarded to ``gkern``.
    :param fov: field of view [degree]; a scalar or single-element array
    :param s: forwarded to ``gkern`` as ``sigma``
    :return: tuple ``(label, obj_pos_in_2D)`` where ``label`` is whatever
        ``gkern`` returns (presumably a heatmap of the object in 2D pixel
        space) and ``obj_pos_in_2D`` is the projected ``(bx, by)`` pair.
    :raises ValueError: if ``fov`` does not hold exactly one value
    """
    # (offset_x, offset_y, scale) consumed by get_2D_from_3D.
    e = np.array([output_size[0] / 2, output_size[1] / 2, 1])

    # Pass the field of view on as a plain scalar: a 1-element array would
    # force an array-to-scalar conversion downstream, which NumPy deprecates.
    fov_deg = np.asarray(fov, dtype=np.float64)
    if fov_deg.size != 1:
        raise ValueError("fov must be a single value in degrees")
    fov_deg = fov_deg.item()

    # Converting the MuJoCo coordinate into typical computer vision coordinate:
    # swap the first two components and negate the third.
    cam_ori_cv = np.array([cam_ori[1], cam_ori[0], -cam_ori[2]])
    obj_pos_cv = np.array([obj_pos[1], obj_pos[0], -obj_pos[2]])
    cam_pos_cv = np.array([cam_pos[1], cam_pos[0], -cam_pos[2]])

    obj_pos_in_2D, obj_pos_from_cam = get_2D_from_3D(
        obj_pos_cv, cam_pos_cv, cam_ori_cv, fov_deg, e
    )
    # The label centre is passed in swapped order relative to the projection.
    label = gkern(
        output_size[0],
        output_size[1],
        (obj_pos_in_2D[1], obj_pos_in_2D[0]),
        sigma=s,
    )
    return label, obj_pos_in_2D

def get_2D_from_3D(a, c, theta, fov, e):
    """
    Rotate a world-space point into the camera frame and project it to 2D.

    :param a: 3D coordinates of the joint in nparray [m]
    :param c: 3D coordinates of the camera in nparray [m]
    :param theta: camera 3D rotation (Rotation order of x->y->z) in nparray [rad]
    :param fov: field of view [degree]; a scalar or single-element array
    :param e: three values ``(offset_x, offset_y, scale)``: ``e[0]`` and
        ``e[1]`` are added to the projected coordinates, and
        ``e[2] * e[1] / tan(fov / 2)`` is the projection scale.
        This argument is not modified.
    :return:
        - (bx, by) ==> 2D coordinates of the obj [pixel]
        - d ==> 3D coordinates of the joint (relative to the camera) [m]
    :raises ValueError: if ``fov`` does not hold exactly one value
    """
    fov_deg = np.asarray(fov, dtype=np.float64)
    if fov_deg.size != 1:
        raise ValueError("fov must be a single value in degrees")

    # Get the vector from camera to object in global coordinate.
    ac_diff = np.asarray(a) - np.asarray(c)

    # Rotate the vector in to camera coordinate
    cos_x, cos_y, cos_z = np.cos(theta[0]), np.cos(theta[1]), np.cos(theta[2])
    sin_x, sin_y, sin_z = np.sin(theta[0]), np.sin(theta[1]), np.sin(theta[2])

    x_rot = np.array([[1, 0, 0],
                      [0, cos_x, sin_x],
                      [0, -sin_x, cos_x]])

    y_rot = np.array([[cos_y, 0, -sin_y],
                      [0, 1, 0],
                      [sin_y, 0, cos_y]])

    z_rot = np.array([[cos_z, sin_z, 0],
                      [-sin_z, cos_z, 0],
                      [0, 0, 1]])

    transform = z_rot.dot(y_rot.dot(x_rot))
    d = transform.dot(ac_diff)

    # Scaling of the projection plane using fov. The scale is kept in a local
    # variable instead of being written back into ``e``, so that reusing the
    # same ``e`` across calls does not compound the factor.
    # NOTE(review): the scale is derived from e[1] only; for non-square
    # images, confirm that ``fov`` refers to that axis.
    fov_rad = np.deg2rad(fov_deg.item())
    scale = e[2] * (e[1] * 1 / np.tan(fov_rad / 2.0))

    # Projection from d to 2D (d[2] == 0 yields inf/nan, as with any
    # point lying in the camera plane).
    bx = scale * d[0] / (d[2]) + e[0]
    by = scale * d[1] / (d[2]) + e[1]

    return (bx, by), d

import numpy as np
import matplotlib.pyplot as plt
from projection2d import global2label

# Output size (height and width) of the 2D projection label, in pixels.
output_size = [480, 640]
# Standard deviation of the heatmap signal.
s = 1

# `positions` contains the positions of all objects in the MuJoCo env, each
# obtained with get_global_position_of_cube.
# NOTE(review): mat2euler, positions, mujoco_sim, env and frames are not
# defined or imported in this snippet -- confirm they are in scope.
for position in positions:
    obj_pos = position['pos']

    # Camera pose: flattened position, and the 3x3 orientation matrix
    # transposed before conversion to angles.
    camera_pos = mujoco_sim.data.cam_xpos.reshape(-1)
    camera_xmat = mujoco_sim.data.cam_xmat.reshape(3, 3).T
    cam_ori = mat2euler(camera_xmat)
    fov = env._env.sim.model.cam_fovy[0]

    label, obj_pos_in_2D = global2label(
        obj_pos, camera_pos, cam_ori, output_size, fov=fov, s=s
    )

    # Show the generated label, then the most recent rendered frame.
    plt.imshow(label)
    plt.show()
    plt.imshow(frames[-1])
    plt.show()

I ran a sanity check on a hard-coded example I made and it worked fine, but with the data from the MuJoCo environment it does not work: the 2D points I get are incorrect. What am I doing wrong?

anat-T avatar Dec 17 '24 18:12 anat-T