Fast-BEV
Oddly looking ground-truth data during visualization
Image example
After training the model and running the visualization, I noticed that the ground-truth bounding boxes look odd (they do not cover the objects precisely, some look completely off, etc.). Is it supposed to be like this, or is this a bug? For reference, here is the visualization code (it differs slightly from the one provided in this repo):
Code
import os

import cv2
import mmcv
import numpy as np
from IPython import embed

# Project-local helpers assumed importable from the repo:
# tofloat, get_colors, map2lssmap, process_bev_res_in_front,
# sort_list, gen_video, puttext, imread, imsave.


class NuScenesMultiViewDataset(MultiViewMixin, NuScenesSeqDataset):
    def get_data_info(self, index):
        data_info = super().get_data_info(index)
        n_cameras = len(data_info['img_filename'])
        self.sensor_count = n_cameras  # used by the drawing loops below
        if not self.sequential and not self.only_front_3:
            assert n_cameras == 6
        elif not self.sequential and self.only_front_3:
            assert n_cameras == 3
        new_info = dict(
            sample_idx=data_info['sample_idx'],
            img_prefix=[None] * n_cameras,
            img_info=[dict(filename=x) for x in data_info['img_filename']],
            lidar2img=dict(
                extrinsic=[tofloat(x) for x in data_info['lidar2img']],
                intrinsic=np.eye(4, dtype=np.float32),
                lidar2img_aug=data_info['lidar2img_aug'],
                lidar2img_extra=data_info['lidar2img_extra']
            )
        )
        if 'ann_info' in data_info:
            # drop annotations with negative (ignored) labels
            gt_bboxes_3d = data_info['ann_info']['gt_bboxes_3d']
            gt_labels_3d = data_info['ann_info']['gt_labels_3d'].copy()
            mask = gt_labels_3d >= 0
            gt_bboxes_3d = gt_bboxes_3d[mask]
            gt_names = data_info['ann_info']['gt_names'][mask]
            gt_labels_3d = gt_labels_3d[mask]
            new_info['ann_info'] = dict(
                gt_bboxes_3d=gt_bboxes_3d,
                gt_names=gt_names,
                gt_labels_3d=gt_labels_3d
            )
        return new_info
    def evaluate(self, results, *args, **kwargs):
        # update boxes with zero velocity
        new_results = []
        for i in range(len(results)):
            box_type = type(results[i]['boxes_3d'])
            boxes_3d = results[i]['boxes_3d'].tensor
            boxes_3d = box_type(boxes_3d, box_dim=9, origin=(0.5, 0.5, 0)).convert_to(self.box_mode_3d)
            new_results.append(dict(
                boxes_3d=boxes_3d,
                scores_3d=results[i]['scores_3d'],
                labels_3d=results[i]['labels_3d']
            ))
        vis_mode = kwargs.get('vis_mode', False)
        if vis_mode:
            embed(header='### vis nus test data ###')
            print('### vis nus test data ###')
            self.show(new_results, 'trash/test', thr=0.3)
            print('### finish vis ###')
            exit()
        kwargs.pop('vis_mode', None)
        result_dict = super().evaluate(new_results, *args, **kwargs)
        print(result_dict)
        return result_dict
    @staticmethod
    def draw_corners(img, corners, color, projection):
        # project the 8 box corners (LiDAR frame) onto the image plane
        corners_3d_4 = np.concatenate((corners, np.ones((8, 1))), axis=1)
        corners_2d_3 = corners_3d_4 @ projection.T
        z_mask = corners_2d_3[:, 2] > 0  # keep only corners in front of the camera
        corners_2d = corners_2d_3[:, :2] / corners_2d_3[:, 2:]
        corners_2d = corners_2d.astype(np.int32)  # np.int was removed from numpy
        for i, j in [
            [0, 1], [1, 2], [2, 3], [3, 0],
            [4, 5], [5, 6], [6, 7], [7, 4],
            [0, 4], [1, 5], [2, 6], [3, 7]
        ]:
            if z_mask[i] and z_mask[j]:
                img = cv2.line(
                    img=img,
                    pt1=tuple(corners_2d[i]),
                    pt2=tuple(corners_2d[j]),
                    color=color,
                    thickness=2,
                    lineType=cv2.LINE_AA)
        # draw an `X' on the front face
        if z_mask[0] and z_mask[5]:
            img = cv2.line(
                img=img,
                pt1=tuple(corners_2d[0]),
                pt2=tuple(corners_2d[5]),
                color=color,
                thickness=2,
                lineType=cv2.LINE_AA)
        if z_mask[1] and z_mask[4]:
            img = cv2.line(
                img=img,
                pt1=tuple(corners_2d[1]),
                pt2=tuple(corners_2d[4]),
                color=color,
                thickness=2,
                lineType=cv2.LINE_AA)
    def draw_bev_bbox_corner(self, img, box, color, scale_fac):
        # box holds the 4 BEV footprint corners in metres, ego-centred
        box = box[:, None, :]  # [4, 1, 2]
        box = box + 50         # shift [-50, 50] m into [0, 100] m
        box = box * scale_fac  # metres -> pixels
        box = box.astype(np.int32)  # np.int0 was removed from numpy
        img = cv2.polylines(img, [box], isClosed=True, color=color, thickness=2)
        return img
    def show(self, results, out_dir='trash', bev_seg_results=None, thr=0.3, fps=3, make_video=False):
        assert out_dir is not None, 'Expected out_dir, got None.'
        colors = get_colors()
        all_img_gt, all_img_pred, all_bev_gt, all_bev_pred = [], [], [], []
        for i, result in enumerate(results):
            info = self.get_data_info(i)
            gt_bboxes = self.get_ann_info(i)
            print('saving image {}/{} to {}'.format(i, len(results), out_dir))
            # draw 3d boxes in BEV
            scale_fac = 10
            out_file_dir = str(i)

            ###### draw BEV pred ######
            bev_pred_img = np.zeros((100 * scale_fac, 100 * scale_fac, 3))
            if bev_seg_results is not None:
                bev_pred_road, bev_pred_lane = bev_seg_results[i]['seg_pred_road'], bev_seg_results[i]['seg_pred_lane']
                bev_pred_img = map2lssmap(bev_pred_road, bev_pred_lane)
                bev_pred_img = mmcv.imresize(bev_pred_img,
                                             (100 * scale_fac, 100 * scale_fac),
                                             interpolation='bilinear')
            scores = result['scores_3d'].numpy()
            try:
                # indices [0, 2, 6, 4] pick one corner per vertical edge,
                # i.e. the BEV footprint of each box
                bev_box_pred = result['boxes_3d'].corners.numpy()[:, [0, 2, 6, 4]][..., :2][scores > thr]
                labels = result['labels_3d'].numpy()[scores > thr]
                assert bev_box_pred.shape[0] == labels.shape[0]
                for idx in range(len(labels)):
                    bev_pred_img = self.draw_bev_bbox_corner(bev_pred_img, bev_box_pred[idx], colors[labels[idx]], scale_fac)
            except Exception:
                pass
            bev_pred_img = process_bev_res_in_front(bev_pred_img)
            imsave(os.path.join(out_dir, out_file_dir, 'bev_pred.png'), mmcv.imrescale(bev_pred_img, 0.5))

            bev_gt_img = np.zeros((100 * scale_fac, 100 * scale_fac, 3))
            if bev_seg_results is not None:
                sample_token = self.get_data_info(i)['sample_idx']
                bev_seg_gt = self._get_map_by_sample_token(sample_token).astype('uint8')
                bev_gt_road, bev_gt_lane = bev_seg_gt[..., 0], bev_seg_gt[..., 1]
                bev_seg_gt = map2lssmap(bev_gt_road, bev_gt_lane)
                bev_gt_img = mmcv.imresize(
                    bev_seg_gt,
                    (100 * scale_fac, 100 * scale_fac),
                    interpolation='bilinear')
            try:
                # draw BEV GT
                bev_gt_bboxes = gt_bboxes['gt_bboxes_3d'].corners.numpy()[:, [0, 2, 6, 4]][..., :2]
                labels_gt = gt_bboxes['gt_labels_3d']
                for idx in range(len(labels_gt)):
                    bev_gt_img = self.draw_bev_bbox_corner(bev_gt_img, bev_gt_bboxes[idx], colors[labels_gt[idx]], scale_fac)
            except Exception:
                pass
            bev_gt_img = process_bev_res_in_front(bev_gt_img)
            imsave(os.path.join(out_dir, out_file_dir, 'bev_gt.png'), mmcv.imrescale(bev_gt_img, 0.5))
            all_bev_gt.append(mmcv.imrescale(bev_gt_img, 0.5))
            all_bev_pred.append(mmcv.imrescale(bev_pred_img, 0.5))
            ###### draw BEV pred ######

            ###### draw 3d boxes in images ######
            img_gt_list = []
            img_pred_list = []
            for j in range(0, self.sensor_count):  # we want only the first {sensor_count} frames in the time sequence
                img_pred = imread(info['img_info'][j]['filename'])
                img_gt = imread(info['img_info'][j]['filename'])
                # camera name
                camera_name = info['img_info'][j]['filename'].split('/')[-2]
                puttext(img_pred, camera_name)
                puttext(img_gt, camera_name)
                extrinsic = info['lidar2img']['extrinsic'][j]
                intrinsic = info['lidar2img']['intrinsic'][:3, :3]
                projection = intrinsic @ extrinsic[:3]
                if len(result['scores_3d']):
                    # draw predictions
                    corners = result['boxes_3d'].corners.numpy()
                    scores = result['scores_3d'].numpy()
                    labels = result['labels_3d'].numpy()
                    for corner, score, label in zip(corners, scores, labels):
                        if score < thr:
                            continue
                        try:
                            self.draw_corners(img_pred, corner, colors[label], projection)
                        except Exception:
                            pass
                try:
                    # draw GT
                    corners = gt_bboxes['gt_bboxes_3d'].corners.numpy()
                    labels = gt_bboxes['gt_labels_3d']
                    for corner, label in zip(corners, labels):
                        self.draw_corners(img_gt, corner, colors[label], projection)
                except Exception:
                    pass
                out_file_dir = str(i)
                mmcv.mkdir_or_exist(os.path.join(out_dir, out_file_dir))
                # downscale the images to make them easier to view
                img_gt_pred = np.concatenate([img_gt, img_pred], 0)
                imsave(os.path.join(out_dir, out_file_dir, '{}_gt_pred.png'.format(j)), mmcv.imrescale(img_gt_pred, 0.5))
                img_gt_list.append(mmcv.imrescale(img_gt, 0.5))
                img_pred_list.append(mmcv.imrescale(img_pred, 0.5))
            ###### draw 3d boxes in images ######

            ###### generate videos, step 1 ######
            if make_video:
                assert self.sensor_count == 6, "Video generation is only implemented for the 6-camera setup."
                assert not self.sequential, "Video generation is not implemented for sequential features."
                tmp_img_up_pred = np.concatenate(sort_list(img_pred_list[0:3], sort=[2, 0, 1]), axis=1)
                tmp_img_bottom_pred = np.concatenate(sort_list(img_pred_list[3:6], sort=[2, 0, 1]), axis=1)
                tmp_img_pred = np.concatenate([tmp_img_up_pred, tmp_img_bottom_pred], axis=0)
                all_img_pred.append(tmp_img_pred)
                tmp_img_up_gt = np.concatenate(sort_list(img_gt_list[0:3], sort=[2, 0, 1]), axis=1)
                tmp_img_bottom_gt = np.concatenate(sort_list(img_gt_list[3:6], sort=[2, 0, 1]), axis=1)
                tmp_img_gt = np.concatenate([tmp_img_up_gt, tmp_img_bottom_gt], axis=0)
                all_img_gt.append(tmp_img_gt)
            ###### generate videos, step 1 ######
        if make_video:
            ###### generate videos, step 2 ######
            gen_video(all_img_pred, all_bev_pred, out_dir, 'pred', fps=fps)
            gen_video(all_img_gt, all_bev_gt, out_dir, 'gt', fps=fps)
            ###### generate videos, step 2 ######
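For completeness, this is how I trigger the visualization path: evaluate() forwards vis_mode to show() and exits afterwards. A minimal sketch (dataset and outputs are stand-ins for whatever your test script builds):

# `dataset` is this NuScenesMultiViewDataset; `outputs` is the list of
# per-sample result dicts collected by the test loop.
dataset.evaluate(outputs, vis_mode=True)  # saves images under trash/test, then exits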
These changes are motivated by a bug I encountered earlier while trying to run the visualization for the sequential model.
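To make it easier to reason about where the GT overlay could go wrong, here is the projection that draw_corners performs, spelled out for a single corner. A minimal sketch with made-up matrices (the identity extrinsic and the toy intrinsic are hypothetical, purely for illustration):

import numpy as np

extrinsic = np.eye(4, dtype=np.float32)  # hypothetical lidar2img extrinsic
intrinsic = np.array([[800., 0., 640.],
                      [0., 800., 360.],
                      [0., 0., 1.]], dtype=np.float32)  # toy pinhole intrinsic
projection = intrinsic @ extrinsic[:3]   # 3x4, exactly as built in show()

corner = np.array([2.0, 0.5, 10.0, 1.0])  # one 3D corner, homogeneous coordinates
uvw = projection @ corner                 # equivalent to corners_3d_4 @ projection.T
if uvw[2] > 0:                            # the z-mask: point is in front of the camera
    uv = uvw[:2] / uvw[2]                 # perspective divide -> pixel coordinates
    print(uv)                             # -> [800. 400.]

If the GT boxes land in wrong places while predictions look fine, the extrinsic fed into this product is the first thing I would double-check.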
Possibly important context: I also replaced nms_gpu with nms_rotated from mmcv (I am using slightly different package versions). Could this affect the ground truth?
Note: nms_rotated takes boxes in xywhr format; I have accounted for that by removing the xywhr2xyxyr conversion, so that should not be an issue. Additionally, the mAP of the model is reasonably close to the numbers reported in the paper; only the visualizations are weird.
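For reference, the format difference in question: the original pipeline converts BEV boxes with xywhr2xyxyr before calling nms_gpu, whereas mmcv's nms_rotated consumes (x_center, y_center, width, height, angle) boxes directly. A minimal sketch of the replacement call (toy boxes, assuming an mmcv version with compiled ops):

import torch
from mmcv.ops import nms_rotated

# two heavily overlapping rotated boxes in xywhr format
boxes_xywhr = torch.tensor([[10.0, 10.0, 4.0, 2.0, 0.3],
                            [10.2, 10.1, 4.0, 2.0, 0.3]])
scores = torch.tensor([0.9, 0.8])
dets, keep = nms_rotated(boxes_xywhr, scores, iou_threshold=0.5)
# keep -> indices of the boxes retained after rotated NMS (here only the first)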
Were you able to solve it? My results/GT look very similar and not at all how they should.
Not really, I gave up :) I strongly suspect there is some kind of bug in the visualization implementation, but I was unable to locate it.