CenterPoint-KITTI icon indicating copy to clipboard operation
CenterPoint-KITTI copied to clipboard

Multi-GPU training results are incorrect — how can I correct them?

Open chenrui17 opened this issue 2 years ago • 1 comments

My training result with a single GPU looks like this image, but the multi-GPU training result looks like this image.

Can you help me figure out what is going wrong in multi-GPU mode?

By the way, in order to run the repo code, I modified some code, for example:

  1. Modified the `transform_points_to_voxels` function in `pcdet/datasets/processor/data_process.py`, updating some spconv API calls to adapt to spconv 2.x.
def transform_points_to_voxels(self, data_dict=None, config=None, voxel_generator=None):
    """Voxelize a point cloud using spconv 2.x's ``PointToVoxel``.

    This processor is called twice: once at configuration time with
    ``data_dict is None`` (builds the voxel generator, caches grid/voxel
    size on ``self``, and returns a partial of itself bound to the
    generator) and then once per sample to perform the voxelization.

    Args:
        data_dict: Sample dict containing 'points' as an (N, C) numpy
            array, or ``None`` during the configuration pass.
        config: Processor config providing VOXEL_SIZE,
            MAX_POINTS_PER_VOXEL and MAX_NUMBER_OF_VOXELS.
        voxel_generator: ``PointToVoxel`` instance bound by the partial.

    Returns:
        ``functools.partial`` on the configuration pass; otherwise
        ``data_dict`` augmented with 'voxels', 'voxel_coords' and
        'voxel_num_points'.
    """
    if data_dict is None:
        # spconv 2.x replaces spconv.utils.VoxelGenerator(V2) with
        # spconv.pytorch.utils.PointToVoxel.
        from spconv.pytorch.utils import PointToVoxel
        voxel_generator = PointToVoxel(
            vsize_xyz=config.VOXEL_SIZE,
            coors_range_xyz=self.point_cloud_range,
            num_point_features=self.num_point_features,
            max_num_voxels=config.MAX_NUMBER_OF_VOXELS[self.mode],
            max_num_points_per_voxel=config.MAX_POINTS_PER_VOXEL
        )
        grid_size = (self.point_cloud_range[3:6] - self.point_cloud_range[0:3]) / np.array(config.VOXEL_SIZE)
        self.grid_size = np.round(grid_size).astype(np.int64)
        self.voxel_size = config.VOXEL_SIZE
        return partial(self.transform_points_to_voxels, voxel_generator=voxel_generator)

    points = data_dict['points']
    # PointToVoxel expects a torch tensor (the old VoxelGenerator took a
    # numpy array) and returns torch tensors.
    voxel_output = voxel_generator(torch.from_numpy(points))
    if isinstance(voxel_output, dict):
        voxels, coordinates, num_points = \
            voxel_output['voxels'], voxel_output['coordinates'], voxel_output['num_points_per_voxel']
    else:
        voxels, coordinates, num_points = voxel_output

    # Convert back to numpy: the legacy VoxelGenerator produced numpy
    # arrays, and the downstream dataset pipeline was written against
    # that. NOTE(review): leaving torch tensors here is a plausible cause
    # of the single-GPU vs multi-GPU result mismatch reported in this
    # issue — verify against collate_batch in the dataset code.
    if isinstance(voxels, torch.Tensor):
        voxels = voxels.numpy()
        coordinates = coordinates.numpy()
        num_points = num_points.numpy()

    if not data_dict['use_lead_xyz']:
        voxels = voxels[..., 3:]  # drop the leading x, y, z channels

    data_dict['voxels'] = voxels
    data_dict['voxel_coords'] = coordinates
    data_dict['voxel_num_points'] = num_points
    return data_dict
  1. Modified the `assign_targets` function in `pcdet/models/dense_heads/centerpoint_head_single.py` to fix a tensor-conversion problem caused by a numpy version that is too new.
def assign_targets(self, gt_boxes):
    """Generate CenterPoint training targets from ground-truth boxes.

    Args:
        gt_boxes: (B, M, 8) tensor — 7 box parameters followed by the
            class label in the last channel.

    Returns:
        dict with per-task lists of batch-stacked tensors:
            - 'heatmaps': list[torch.Tensor], heatmap scores.
            - 'anno_boxes': list[torch.Tensor], ground-truth box targets.
            - 'inds': list[torch.Tensor], indices of the valid boxes.
            - 'masks': list[torch.Tensor], masks marking valid boxes.
    """
    gt_bboxes_3d, gt_labels_3d = gt_boxes[..., :-1], gt_boxes[..., -1]

    heatmaps, anno_boxes, inds, masks = multi_apply(
        self.get_targets_single, gt_bboxes_3d, gt_labels_3d)

    # multi_apply yields per-sample lists (length B), each holding one
    # tensor per task; the tensors differ in shape across tasks, so the
    # old np.array(...).transpose(1, 0) trick is rejected by newer numpy
    # (ragged arrays). zip(*...) regroups to per-task lists of length
    # num_tasks, which are then stacked over the batch dimension.
    def _regroup_and_stack(per_sample):
        # Transpose [sample][task] -> [task][sample], then stack batch.
        return [torch.stack(list(per_task)) for per_task in zip(*per_sample)]

    all_targets_dict = {
        'heatmaps': _regroup_and_stack(heatmaps),
        'anno_boxes': _regroup_and_stack(anno_boxes),
        'inds': _regroup_and_stack(inds),
        'masks': _regroup_and_stack(masks),
    }

    return all_targets_dict

my cmd is single gpu training : python train.py --cfg_file cfgs/kitti_models/centerpoint.yaml multi gpu training : bash scripts/dist_train.sh 8 --cfg_file cfgs/kitti_models/centerpoint.yaml

chenrui17 avatar Nov 14 '22 13:11 chenrui17

请问,有在训练过程中有遇到这个错误吗,我强制转换成float仍然不起作用.我的torch版本是1.1 spconv是1.0 File "/home/neousys/cjg/CenterPoint-KITTI/pcdet/models/dense_heads/centerpoint_head_single.py", line 687, in gaussian_focal_loss pos_loss = (-(pred + eps).log() * (1 - pred).pow(alpha) * pos_weights).float() RuntimeError: expected backend CUDA and dtype Float but got backend CUDA and dtype Byte

evil-master avatar Jul 21 '23 01:07 evil-master