Fast-BEV
Oddly looking ground-truth data during visualization
Image example
After training the model and running the visualization, I noticed that the ground-truth bounding boxes look odd (they do not cover the objects precisely, some look completely off, etc.). Is it supposed to be like this, or is this a bug? For reference, here is the visualization code (it differs slightly from the one provided in this repo):
Code
import os

import cv2
import mmcv
import numpy as np
from IPython import embed

# Project-local helpers assumed importable from the repo:
# tofloat, get_colors, map2lssmap, process_bev_res_in_front,
# sort_list, gen_video, puttext, imread, imsave.


class NuScenesMultiViewDataset(MultiViewMixin, NuScenesSeqDataset):
    def get_data_info(self, index):
        data_info = super().get_data_info(index)
        n_cameras = len(data_info['img_filename'])
        self.sensor_count = n_cameras  # used by the drawing loops below
        if not self.sequential and not self.only_front_3:
            assert n_cameras == 6
        elif not self.sequential and self.only_front_3:
            assert n_cameras == 3
        new_info = dict(
            sample_idx=data_info['sample_idx'],
            img_prefix=[None] * n_cameras,
            img_info=[dict(filename=x) for x in data_info['img_filename']],
            lidar2img=dict(
                extrinsic=[tofloat(x) for x in data_info['lidar2img']],
                intrinsic=np.eye(4, dtype=np.float32),
                lidar2img_aug=data_info['lidar2img_aug'],
                lidar2img_extra=data_info['lidar2img_extra']
            )
        )
        if 'ann_info' in data_info:
            # drop annotations with negative (ignored) labels
            gt_bboxes_3d = data_info['ann_info']['gt_bboxes_3d']
            gt_labels_3d = data_info['ann_info']['gt_labels_3d'].copy()
            mask = gt_labels_3d >= 0
            gt_bboxes_3d = gt_bboxes_3d[mask]
            gt_names = data_info['ann_info']['gt_names'][mask]
            gt_labels_3d = gt_labels_3d[mask]
            new_info['ann_info'] = dict(
                gt_bboxes_3d=gt_bboxes_3d,
                gt_names=gt_names,
                gt_labels_3d=gt_labels_3d
            )
        return new_info
    def evaluate(self, results, *args, **kwargs):
        # update boxes with zero velocity
        new_results = []
        for i in range(len(results)):
            box_type = type(results[i]['boxes_3d'])
            boxes_3d = results[i]['boxes_3d'].tensor
            boxes_3d = box_type(boxes_3d, box_dim=9, origin=(0.5, 0.5, 0)).convert_to(self.box_mode_3d)
            new_results.append(dict(
                boxes_3d=boxes_3d,
                scores_3d=results[i]['scores_3d'],
                labels_3d=results[i]['labels_3d']
            ))
        vis_mode = kwargs.get('vis_mode', False)
        if vis_mode:
            embed(header='### vis nus test data ###')
            print('### vis nus test data ###')
            self.show(new_results, 'trash/test', thr=0.3)
            print('### finish vis ###')
            exit()
        kwargs.pop('vis_mode', None)
        result_dict = super().evaluate(new_results, *args, **kwargs)
        print(result_dict)
        return result_dict
    @staticmethod
    def draw_corners(img, corners, color, projection):
        # project the 8 box corners (LiDAR frame) onto the image plane
        corners_3d_4 = np.concatenate((corners, np.ones((8, 1))), axis=1)
        corners_2d_3 = corners_3d_4 @ projection.T
        z_mask = corners_2d_3[:, 2] > 0  # keep only corners in front of the camera
        corners_2d = corners_2d_3[:, :2] / corners_2d_3[:, 2:]
        corners_2d = corners_2d.astype(np.int32)  # np.int was removed from numpy
        for i, j in [
            [0, 1], [1, 2], [2, 3], [3, 0],
            [4, 5], [5, 6], [6, 7], [7, 4],
            [0, 4], [1, 5], [2, 6], [3, 7]
        ]:
            if z_mask[i] and z_mask[j]:
                img = cv2.line(
                    img=img,
                    pt1=tuple(corners_2d[i]),
                    pt2=tuple(corners_2d[j]),
                    color=color,
                    thickness=2,
                    lineType=cv2.LINE_AA)
        # draw an `X' on the front face
        if z_mask[0] and z_mask[5]:
            img = cv2.line(
                img=img,
                pt1=tuple(corners_2d[0]),
                pt2=tuple(corners_2d[5]),
                color=color,
                thickness=2,
                lineType=cv2.LINE_AA)
        if z_mask[1] and z_mask[4]:
            img = cv2.line(
                img=img,
                pt1=tuple(corners_2d[1]),
                pt2=tuple(corners_2d[4]),
                color=color,
                thickness=2,
                lineType=cv2.LINE_AA)
    def draw_bev_bbox_corner(self, img, box, color, scale_fac):
        # box holds the 4 BEV footprint corners in metres, ego-centred
        box = box[:, None, :]  # [4, 1, 2]
        box = box + 50         # shift [-50, 50] m into [0, 100] m
        box = box * scale_fac  # metres -> pixels
        box = box.astype(np.int32)  # np.int0 was removed from numpy
        img = cv2.polylines(img, [box], isClosed=True, color=color, thickness=2)
        return img
    def show(self, results, out_dir='trash', bev_seg_results=None, thr=0.3, fps=3, make_video=False):
        assert out_dir is not None, 'Expected out_dir, got None.'
        colors = get_colors()
        all_img_gt, all_img_pred, all_bev_gt, all_bev_pred = [], [], [], []
        for i, result in enumerate(results):
            info = self.get_data_info(i)
            gt_bboxes = self.get_ann_info(i)
            print('saving image {}/{} to {}'.format(i, len(results), out_dir))
            # draw 3d boxes in BEV
            scale_fac = 10
            out_file_dir = str(i)

            ###### draw BEV pred ######
            bev_pred_img = np.zeros((100 * scale_fac, 100 * scale_fac, 3))
            if bev_seg_results is not None:
                bev_pred_road, bev_pred_lane = bev_seg_results[i]['seg_pred_road'], bev_seg_results[i]['seg_pred_lane']
                bev_pred_img = map2lssmap(bev_pred_road, bev_pred_lane)
                bev_pred_img = mmcv.imresize(bev_pred_img,
                                             (100 * scale_fac, 100 * scale_fac),
                                             interpolation='bilinear')
            scores = result['scores_3d'].numpy()
            try:
                # indices [0, 2, 6, 4] pick one corner per vertical edge,
                # i.e. the BEV footprint of each box
                bev_box_pred = result['boxes_3d'].corners.numpy()[:, [0, 2, 6, 4]][..., :2][scores > thr]
                labels = result['labels_3d'].numpy()[scores > thr]
                assert bev_box_pred.shape[0] == labels.shape[0]
                for idx in range(len(labels)):
                    bev_pred_img = self.draw_bev_bbox_corner(bev_pred_img, bev_box_pred[idx], colors[labels[idx]], scale_fac)
            except Exception:
                pass
            bev_pred_img = process_bev_res_in_front(bev_pred_img)
            imsave(os.path.join(out_dir, out_file_dir, 'bev_pred.png'), mmcv.imrescale(bev_pred_img, 0.5))

            bev_gt_img = np.zeros((100 * scale_fac, 100 * scale_fac, 3))
            if bev_seg_results is not None:
                sample_token = self.get_data_info(i)['sample_idx']
                bev_seg_gt = self._get_map_by_sample_token(sample_token).astype('uint8')
                bev_gt_road, bev_gt_lane = bev_seg_gt[..., 0], bev_seg_gt[..., 1]
                bev_seg_gt = map2lssmap(bev_gt_road, bev_gt_lane)
                bev_gt_img = mmcv.imresize(
                    bev_seg_gt,
                    (100 * scale_fac, 100 * scale_fac),
                    interpolation='bilinear')
            try:
                # draw BEV GT
                bev_gt_bboxes = gt_bboxes['gt_bboxes_3d'].corners.numpy()[:, [0, 2, 6, 4]][..., :2]
                labels_gt = gt_bboxes['gt_labels_3d']
                for idx in range(len(labels_gt)):
                    bev_gt_img = self.draw_bev_bbox_corner(bev_gt_img, bev_gt_bboxes[idx], colors[labels_gt[idx]], scale_fac)
            except Exception:
                pass
            bev_gt_img = process_bev_res_in_front(bev_gt_img)
            imsave(os.path.join(out_dir, out_file_dir, 'bev_gt.png'), mmcv.imrescale(bev_gt_img, 0.5))
            all_bev_gt.append(mmcv.imrescale(bev_gt_img, 0.5))
            all_bev_pred.append(mmcv.imrescale(bev_pred_img, 0.5))
            ###### draw BEV pred ######

            ###### draw 3d boxes in images ######
            img_gt_list = []
            img_pred_list = []
            for j in range(0, self.sensor_count):  # we want only the first {sensor_count} frames in the time sequence
                img_pred = imread(info['img_info'][j]['filename'])
                img_gt = imread(info['img_info'][j]['filename'])
                # camera name
                camera_name = info['img_info'][j]['filename'].split('/')[-2]
                puttext(img_pred, camera_name)
                puttext(img_gt, camera_name)
                extrinsic = info['lidar2img']['extrinsic'][j]
                intrinsic = info['lidar2img']['intrinsic'][:3, :3]
                projection = intrinsic @ extrinsic[:3]
                if len(result['scores_3d']):
                    # draw predictions
                    corners = result['boxes_3d'].corners.numpy()
                    scores = result['scores_3d'].numpy()
                    labels = result['labels_3d'].numpy()
                    for corner, score, label in zip(corners, scores, labels):
                        if score < thr:
                            continue
                        try:
                            self.draw_corners(img_pred, corner, colors[label], projection)
                        except Exception:
                            pass
                try:
                    # draw GT
                    corners = gt_bboxes['gt_bboxes_3d'].corners.numpy()
                    labels = gt_bboxes['gt_labels_3d']
                    for corner, label in zip(corners, labels):
                        self.draw_corners(img_gt, corner, colors[label], projection)
                except Exception:
                    pass
                out_file_dir = str(i)
                mmcv.mkdir_or_exist(os.path.join(out_dir, out_file_dir))
                # downscale the images to make them easier to view
                img_gt_pred = np.concatenate([img_gt, img_pred], 0)
                imsave(os.path.join(out_dir, out_file_dir, '{}_gt_pred.png'.format(j)), mmcv.imrescale(img_gt_pred, 0.5))
                img_gt_list.append(mmcv.imrescale(img_gt, 0.5))
                img_pred_list.append(mmcv.imrescale(img_pred, 0.5))
            ###### draw 3d boxes in images ######

            ###### generate videos, step 1 ######
            if make_video:
                assert self.sensor_count == 6, "Video generation is only implemented for the 6-camera setup."
                assert not self.sequential, "Video generation is not implemented for sequential features."
                tmp_img_up_pred = np.concatenate(sort_list(img_pred_list[0:3], sort=[2, 0, 1]), axis=1)
                tmp_img_bottom_pred = np.concatenate(sort_list(img_pred_list[3:6], sort=[2, 0, 1]), axis=1)
                tmp_img_pred = np.concatenate([tmp_img_up_pred, tmp_img_bottom_pred], axis=0)
                all_img_pred.append(tmp_img_pred)
                tmp_img_up_gt = np.concatenate(sort_list(img_gt_list[0:3], sort=[2, 0, 1]), axis=1)
                tmp_img_bottom_gt = np.concatenate(sort_list(img_gt_list[3:6], sort=[2, 0, 1]), axis=1)
                tmp_img_gt = np.concatenate([tmp_img_up_gt, tmp_img_bottom_gt], axis=0)
                all_img_gt.append(tmp_img_gt)
            ###### generate videos, step 1 ######
        if make_video:
            ###### generate videos, step 2 ######
            gen_video(all_img_pred, all_bev_pred, out_dir, 'pred', fps=fps)
            gen_video(all_img_gt, all_bev_gt, out_dir, 'gt', fps=fps)
            ###### generate videos, step 2 ######
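For completeness, this is how I trigger the visualization path: evaluate() forwards vis_mode to show() and exits afterwards. A minimal sketch (dataset and outputs are stand-ins for whatever your test script builds):

# `dataset` is this NuScenesMultiViewDataset; `outputs` is the list of
# per-sample result dicts collected by the test loop.
dataset.evaluate(outputs, vis_mode=True)  # saves images under trash/test, then exits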
These changes are motivated by a bug I encountered earlier while trying to run the visualization for the sequential model.
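To make it easier to reason about where the GT overlay could go wrong, here is the projection that draw_corners performs, spelled out for a single corner. A minimal sketch with made-up matrices (the identity extrinsic and the toy intrinsic are hypothetical, purely for illustration):

import numpy as np

extrinsic = np.eye(4, dtype=np.float32)  # hypothetical lidar2img extrinsic
intrinsic = np.array([[800., 0., 640.],
                      [0., 800., 360.],
                      [0., 0., 1.]], dtype=np.float32)  # toy pinhole intrinsic
projection = intrinsic @ extrinsic[:3]   # 3x4, exactly as built in show()

corner = np.array([2.0, 0.5, 10.0, 1.0])  # one 3D corner, homogeneous coordinates
uvw = projection @ corner                 # equivalent to corners_3d_4 @ projection.T
if uvw[2] > 0:                            # the z-mask: point is in front of the camera
    uv = uvw[:2] / uvw[2]                 # perspective divide -> pixel coordinates
    print(uv)                             # -> [800. 400.]

If the GT boxes land in wrong places while predictions look fine, the extrinsic fed into this product is the first thing I would double-check.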
Possibly important context: I also replaced nms_gpu with nms_rotated from mmcv (I am using slightly different package versions). Could this affect the ground truth?
Note: nms_rotated takes boxes in xywhr format; I have accounted for that by removing the xywhr2xyxyr conversion, so that should not be an issue. Additionally, the mAP of the model is reasonably close to the numbers reported in the paper; only the visualizations are weird.
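For reference, the format difference in question: the original pipeline converts BEV boxes with xywhr2xyxyr before calling nms_gpu, whereas mmcv's nms_rotated consumes (x_center, y_center, width, height, angle) boxes directly. A minimal sketch of the replacement call (toy boxes, assuming an mmcv version with compiled ops):

import torch
from mmcv.ops import nms_rotated

# two heavily overlapping rotated boxes in xywhr format
boxes_xywhr = torch.tensor([[10.0, 10.0, 4.0, 2.0, 0.3],
                            [10.2, 10.1, 4.0, 2.0, 0.3]])
scores = torch.tensor([0.9, 0.8])
dets, keep = nms_rotated(boxes_xywhr, scores, iou_threshold=0.5)
# keep -> indices of the boxes retained after rotated NMS (here only the first)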
Were you able to solve it? My results/GT look very similar and not at all how they should.
Not really, I gave up :) I strongly suspect there is some kind of bug in the visualization implementation, but I was unable to locate it.