HybrIK
HybrIK copied to clipboard
How can I test?
In a past issue, I found your answer, which says: "Hi, if you want to test with your own data, you need to run object detection to generate human bounding boxes. Then use RootNet to predict the root joint in each bounding box. Combining the box and root joint, you can run our model to predict the final SMPL results."
So I used RootNet to output the bbox and root joint, but there are still other unknown variables such as trans_inv and the depth factor. How can I find them?
Hi @seoha-kim ,
You can refer to L222, L224 and L246. As long as the annotation files provide `bbox`, `cam_param` and `root_coord`, I think the dataloader can do the rest.
Besides, we plan to release the code for testing on the in-the-wild images later. But still, these three terms are needed.
Hi Jeff, you mean that if I have the bbox, cam_param (focal_length, center), and root_coordinate (from RootNet), I can test on in-the-wild images? Then should I modify the h36m-smpl dataloader for in-the-wild images?
Yes, that's what I mean. Here is our manuscript code. There might be some variable names that should be modified for compatibility with the current code.
import copy
import json
import os
import pickle as pk
import cv2
import numpy as np
from PIL import Image
import scipy.misc
import torch.utils.data as data
from pytpose.utils.bbox import bbox_clip_xyxy, bbox_xywh_to_xyxy
from pytpose.utils.pose_utils import (cam2pixel, pixel2cam, rigid_align,
vis_keypoints, reconstruction_error)
from pytpose.utils.presets import SimpleTransform3DSMPL
class InferSMPL(data.Dataset):
    """Dummy SMPL dataset for inference on arbitrary images.

    Joint layout: 17 Human3.6M joints + 24 SMPL joints. No ground truth is
    built here: every pose / shape entry of a label is a zero placeholder
    (with all joints flagged visible). Only the bounding box, the root joint
    position and the camera intrinsics are filled with meaningful values.

    Parameters
    ----------
    cfg: config object
        Experiment configuration. The ``DATASET``, ``MODEL`` and ``LOSS``
        sections are read in ``__init__``.
    img_list: list of str
        Paths of the images to run inference on.
    bbox_list: list or None
        One person box per image, in ``(x, y, w, h)`` order (each box is
        passed through ``bbox_xywh_to_xyxy``). If None, the box
        ``[1, 1, width - 2, height - 2]`` is used for every image.
    root_cam_list: list or None
        One 3-vector root joint position per image (presumably in camera
        coordinates -- confirm against the producer of this list). If None,
        ``[0, 0, 5000]`` is used for every image.
    """
    CLASSES = ['person']
    # Indices into ``joints_name_17`` of the joints listed in ``joints_name_14``.
    EVAL_JOINTS = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10]

    # Class-level default; ``__init__`` overrides it with cfg.MODEL.NUM_JOINTS.
    num_joints = 17 + 24
    num_thetas = 24
    bbox_3d_shape = (2000, 2000, 2000)
    joints_name_17 = (
        'Pelvis',                               # 0
        'L_Hip', 'L_Knee', 'L_Ankle',           # 3
        'R_Hip', 'R_Knee', 'R_Ankle',           # 6
        'Torso', 'Neck',                        # 8
        'Nose', 'Head',                         # 10
        'L_Shoulder', 'L_Elbow', 'L_Wrist',     # 13
        'R_Shoulder', 'R_Elbow', 'R_Wrist',     # 16
    )
    joints_name_24 = (
        'pelvis', 'left_hip', 'right_hip',      # 2
        'spine1', 'left_knee', 'right_knee',    # 5
        'spine2', 'left_ankle', 'right_ankle',  # 8
        'spine3', 'left_foot', 'right_foot',    # 11
        'neck', 'left_collar', 'right_collar',  # 14
        'jaw',                                  # 15
        'left_shoulder', 'right_shoulder',      # 17
        'left_elbow', 'right_elbow',            # 19
        'left_wrist', 'right_wrist',            # 21
        'left_thumb', 'right_thumb'             # 23
    )
    joints_name_14 = (
        'R_Ankle', 'R_Knee', 'R_Hip',           # 2
        'L_Hip', 'L_Knee', 'L_Ankle',           # 5
        'R_Wrist', 'R_Elbow', 'R_Shoulder',     # 8
        'L_Shoulder', 'L_Elbow', 'L_Wrist',     # 11
        'Neck', 'Head'
    )
    action_name = ['Directions', 'Discussion', 'Eating', 'Greeting', 'Phoning', 'Posing', 'Purchases',
                   'Sitting', 'SittingDown', 'Smoking', 'Photo', 'Waiting', 'Walking', 'WalkDog', 'WalkTogether']
    # (child, parent) index pairs over the 17-joint layout.
    skeleton = (
        (1, 0), (2, 1), (3, 2),         # 2
        (4, 0), (5, 4), (6, 5),         # 5
        (7, 0), (8, 7),                 # 7
        (9, 8), (10, 9),                # 9
        (11, 7), (12, 11), (13, 12),    # 12
        (14, 7), (15, 14), (16, 15),    # 15
    )

    def __init__(self,
                 cfg,
                 img_list,
                 bbox_list,
                 root_cam_list):
        """Read the configuration and build one placeholder label per image."""
        self._cfg = cfg
        self._img_list = img_list
        self._bbox_list = bbox_list
        self._root_cam_list = root_cam_list

        # Inference only: training mode and the DPG flag are always off.
        self._train = False
        self._dpg = False

        self._det_bbox_file = getattr(cfg.DATASET.SET_LIST[0], 'DET_BOX', None)

        self._scale_factor = cfg.DATASET.SCALE_FACTOR
        self._color_factor = cfg.DATASET.COLOR_FACTOR
        self._rot = cfg.DATASET.ROT_FACTOR
        self._input_size = cfg.MODEL.IMAGE_SIZE
        self._output_size = cfg.MODEL.HEATMAP_SIZE

        self._occlusion = cfg.DATASET.OCCLUSION

        self._crop = cfg.MODEL.EXTRA.CROP
        self._sigma = cfg.MODEL.EXTRA.SIGMA
        self._depth_dim = getattr(cfg.MODEL.EXTRA, 'DEPTH_DIM', None)

        self._check_centers = False

        self.num_class = len(self.CLASSES)
        self.num_joints = cfg.MODEL.NUM_JOINTS

        self.num_joints_half_body = cfg.DATASET.NUM_JOINTS_HALF_BODY
        self.prob_half_body = cfg.DATASET.PROB_HALF_BODY

        self.augment = cfg.MODEL.EXTRA.AUGMENT
        self.dz_factor = cfg.MODEL.EXTRA.get('FACTOR', None)

        self._loss_type = cfg.LOSS['TYPE']

        self.upper_body_ids = (7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
        self.lower_body_ids = (0, 1, 2, 3, 4, 5, 6)

        self.kinematic = cfg.MODEL.EXTRA.get('KINEMATIC', False)
        self.classfier = cfg.MODEL.EXTRA.get('WITHCLASSFIER', False)

        self.root_idx_17 = self.joints_name_17.index('Pelvis')
        self.lshoulder_idx_17 = self.joints_name_17.index('L_Shoulder')
        self.rshoulder_idx_17 = self.joints_name_17.index('R_Shoulder')
        self.root_idx_24 = self.joints_name_24.index('pelvis')
        self.lshoulder_idx_24 = self.joints_name_24.index('left_shoulder')
        self.rshoulder_idx_24 = self.joints_name_24.index('right_shoulder')

        self._items, self._labels = self._load_jsons()

        # NOTE(review): ``self.transformation`` is only defined for the
        # 'simple_smpl_3d' preset; with any other preset ``__getitem__`` will
        # fail with AttributeError.
        if cfg.MODEL.EXTRA.PRESET == 'simple_smpl_3d':
            # The dataset itself is handed to the transform as first argument.
            self.transformation = SimpleTransform3DSMPL(
                self, scale_factor=self._scale_factor,
                color_factor=self._color_factor,
                occlusion=self._occlusion,
                input_size=self._input_size,
                output_size=self._output_size,
                depth_dim=self._depth_dim,
                bbox_3d_shape=self.bbox_3d_shape,
                rot=self._rot, sigma=self._sigma,
                train=self._train, add_dpg=self._dpg,
                loss_type=self._loss_type, scale_mult=1)

    def __getitem__(self, idx):
        """Load image ``idx`` and run it through ``self.transformation``.

        Returns
        -------
        tuple
            ``(img, target, img_id, bbox)`` when the transformation returns a
            dict, otherwise ``(img, label, label_mask, img_id, bbox)``.
        """
        img_path = self._items[idx]
        img_id = self._labels[idx]['img_id']

        # Deep copy so the transformation cannot modify the cached label.
        label = copy.deepcopy(self._labels[idx])

        # ``scipy.misc.imread`` was removed in SciPy 1.2. It was a thin
        # wrapper around Pillow, so read the RGB array with Pillow directly
        # and close the file right after decoding.
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img.convert('RGB'))

        target = self.transformation(img, label)

        if isinstance(target, dict):
            img = target.pop('image')
            bbox = target.pop('bbox')
            return img, target, img_id, bbox

        img, label, label_mask, bbox = target
        return img, label, label_mask, img_id, bbox

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self._items)

    def _load_jsons(self):
        """Build the image path list and one placeholder label per image.

        Despite its name, no JSON file is read: labels are assembled from
        ``img_list``, ``bbox_list`` and ``root_cam_list``.

        Returns
        -------
        tuple of (list, list)
            Image paths and the label dict of each image, in the same order.
        """
        items = []
        labels = []

        for i, img_path in enumerate(self._img_list):
            image_id = os.path.basename(img_path)

            # Only the size is needed here; close the file right away so that
            # long image lists do not leak file handles.
            with Image.open(img_path) as img:
                width, height = img.size

            if self._root_cam_list is not None:
                root_cam = np.array(self._root_cam_list[i], dtype=np.float32)
            else:
                root_cam = np.array([0, 0, 5000], dtype=np.float32)

            if self._bbox_list is not None:
                raw_box = self._bbox_list[i]
            else:
                raw_box = [1, 1, width - 2, height - 2]
            xmin, ymin, xmax, ymax = bbox_clip_xyxy(
                bbox_xywh_to_xyxy(raw_box), width, height)

            # No calibration is available: a fixed focal length is used and
            # the principal point is placed at the image center.
            f = np.array([1500, 1500], dtype=np.float32)
            c = np.array([width / 2, height / 2], dtype=np.float32)

            items.append(img_path)
            labels.append({
                'bbox': (xmin, ymin, xmax, ymax),
                'img_id': image_id,
                'img_path': img_path,
                'width': width,
                'height': height,
                'root_cam': root_cam,
                'joint_img_17': np.zeros((17, 3), dtype=np.float32),
                'joint_vis_17': np.ones((17, 3), dtype=np.float32),
                'joint_cam_17': np.zeros((17, 3), dtype=np.float32),
                'joint_relative_17': np.zeros((17, 3), dtype=np.float32),
                'joint_img_24': np.zeros((24, 3), dtype=np.float32),
                'joint_vis_24': np.ones((24, 3), dtype=np.float32),
                'joint_cam_24': np.zeros((24, 3), dtype=np.float32),
                'beta': np.zeros((10), dtype=np.float32),
                'theta': np.zeros((24, 3), dtype=np.float32),
                'f': f,
                'c': c
            })

        return items, labels

    @property
    def joint_pairs_17(self):
        """Joint pairs which defines the pairs of joint to be swapped
        when the image is flipped horizontally."""
        return ((1, 4), (2, 5), (3, 6), (11, 14), (12, 15), (13, 16))

    @property
    def joint_pairs_24(self):
        """Joint pairs which defines the pairs of joint to be swapped
        when the image is flipped horizontally."""
        return ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23))

    @property
    def bone_pairs(self):
        """Bone pairs which defines the pairs of bone to be swapped
        when the image is flipped horizontally."""
        return ((0, 3), (1, 4), (2, 5), (10, 13), (11, 14), (12, 15))

    def _get_box_center_area(self, bbox):
        """Return the center ``(x, y)`` and the area of an ``xyxy`` box."""
        c = np.array([(bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0])
        area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
        return c, area

    def _get_keypoints_center_count(self, keypoints):
        """Return the mean position of the visible keypoints and their count.

        ``keypoints`` is indexed as ``[joint, axis, k]`` where ``k == 0`` is
        the coordinate and ``k == 1`` its visibility value.
        """
        keypoint_x = np.sum(keypoints[:, 0, 0] * (keypoints[:, 0, 1] > 0))
        keypoint_y = np.sum(keypoints[:, 1, 0] * (keypoints[:, 1, 1] > 0))
        # The count is the sum of the x-axis visibility values.
        # NOTE(review): it is not guarded against zero, so the division below
        # yields non-finite values when no keypoint is visible.
        num = float(np.sum(keypoints[:, 0, 1]))
        return np.array([keypoint_x / num, keypoint_y / num]), num
Oh, thank you so much. I'll test it using that code.
Hi @Jeff-sjtu Thank you for your inference code. I have 3 questions regarding the above code:
- Does root_cam_list mean the root coord from RootNet?
- Is the cam_params (f,c) fixed for all images? If not, how can I get cam_params?
- Which cfg file could be used for inference?