CenterPoint-KITTI
CenterPoint-KITTI copied to clipboard
Multi-GPU training results are incorrect. How can I fix this?
My training result with a single GPU looks like this:
But the multi-GPU training result looks like this:
Can you help me figure out what goes wrong in multi-GPU mode?
By the way, in order to run the repo code I modified a few things:
- In pcdet/datasets/processor/data_process.py, I modified the function
transform_points_to_voxels
replacing some spconv API calls so that it works with spconv 2.x:
def transform_points_to_voxels(self, data_dict=None, config=None, voxel_generator=None):
    """Voxelize ``data_dict['points']`` using the spconv 2.x ``PointToVoxel`` API.

    The method works in two phases:

    * Setup (``data_dict is None``): build a ``PointToVoxel`` generator from
      ``config``, store ``self.grid_size`` and ``self.voxel_size``, and return
      a ``functools.partial`` of this method bound to that generator.
    * Per sample (``data_dict`` given): run the generator on the points and
      write the voxelization results back into ``data_dict``.

    Args:
        data_dict: Sample dict providing ``'points'`` (a NumPy array, since it
            is passed through ``torch.from_numpy``) and ``'use_lead_xyz'``.
            ``None`` selects the setup phase.
        config: Processor config. The setup phase reads ``VOXEL_SIZE``,
            ``MAX_NUMBER_OF_VOXELS`` (indexed by ``self.mode``) and
            ``MAX_POINTS_PER_VOXEL``.
        voxel_generator: Callable that takes a point tensor and returns either
            a dict with keys ``'voxels'``, ``'coordinates'`` and
            ``'num_points_per_voxel'`` or a ``(voxels, coordinates,
            num_points)`` tuple.

    Returns:
        In the setup phase, a partial of this method with ``voxel_generator``
        bound. Otherwise ``data_dict`` with ``'voxels'``, ``'voxel_coords'``
        and ``'voxel_num_points'`` set; torch outputs of the generator are
        stored as NumPy arrays.
    """
    if data_dict is None:
        # Imported lazily so spconv is only required when voxelization is used.
        from spconv.pytorch.utils import PointToVoxel

        voxel_generator = PointToVoxel(
            vsize_xyz=config.VOXEL_SIZE,
            coors_range_xyz=self.point_cloud_range,
            num_point_features=self.num_point_features,
            max_num_voxels=config.MAX_NUMBER_OF_VOXELS[self.mode],
            max_num_points_per_voxel=config.MAX_POINTS_PER_VOXEL,
        )
        grid_size = (self.point_cloud_range[3:6] - self.point_cloud_range[0:3]) / np.array(config.VOXEL_SIZE)
        self.grid_size = np.round(grid_size).astype(np.int64)
        self.voxel_size = config.VOXEL_SIZE
        return partial(self.transform_points_to_voxels, voxel_generator=voxel_generator)

    def _as_numpy(value):
        # The spconv 1.x generator this replaces returned NumPy arrays, while
        # PointToVoxel returns torch tensors. Convert back so the stored
        # values keep their previous type, and copy so each sample owns its
        # data. NOTE(review): spconv appears to reuse its output buffers
        # between calls -- confirm; the copy is the safe choice either way.
        if isinstance(value, torch.Tensor):
            return value.detach().cpu().numpy().copy()
        return value

    points = data_dict['points']
    voxel_output = voxel_generator(torch.from_numpy(points))
    if isinstance(voxel_output, dict):
        voxels, coordinates, num_points = (
            voxel_output['voxels'],
            voxel_output['coordinates'],
            voxel_output['num_points_per_voxel'],
        )
    else:
        voxels, coordinates, num_points = voxel_output

    voxels = _as_numpy(voxels)
    coordinates = _as_numpy(coordinates)
    num_points = _as_numpy(num_points)

    if not data_dict['use_lead_xyz']:
        voxels = voxels[..., 3:]  # drop the leading xyz channels from each point

    data_dict['voxels'] = voxels
    data_dict['voxel_coords'] = coordinates
    data_dict['voxel_num_points'] = num_points
    return data_dict
- In pcdet/models/dense_heads/centerpoint_head_single.py, I modified the function
assign_targets
to fix a tensor conversion error that occurs because my NumPy version is too new:
def assign_targets(self, gt_boxes):
    """Generate training targets for a batch of ground-truth boxes.

    Args:
        gt_boxes: (B, M, 8) box + cls. The last channel is split off as the
            label; the remaining channels are passed on as the box.

    Returns:
        dict[str, list[torch.Tensor]]: Targets keyed by name, each a list of
        stacked tensors:

        - ``'heatmaps'``: Heatmap scores.
        - ``'anno_boxes'``: Ground truth boxes.
        - ``'inds'``: Indexes indicating the position of the valid boxes.
        - ``'masks'``: Masks indicating which boxes are valid.
    """
    boxes_3d = gt_boxes[..., :-1]
    labels_3d = gt_boxes[..., -1]
    heatmaps, anno_boxes, inds, masks = multi_apply(
        self.get_targets_single, boxes_3d, labels_3d)

    def _regroup_and_stack(nested):
        # Swap the outer and inner nesting levels, then stack every regrouped
        # list into one tensor. This is done with zip rather than a NumPy
        # transpose because the tensors of different tasks differ in shape.
        return [torch.stack(list(group)) for group in zip(*nested)]

    return {
        'heatmaps': _regroup_and_stack(heatmaps),
        'anno_boxes': _regroup_and_stack(anno_boxes),
        'inds': _regroup_and_stack(inds),
        'masks': _regroup_and_stack(masks),
    }
My commands are as follows. Single-GPU training: python train.py --cfg_file cfgs/kitti_models/centerpoint.yaml. Multi-GPU training: bash scripts/dist_train.sh 8 --cfg_file cfgs/kitti_models/centerpoint.yaml
请问，在训练过程中有遇到过这个错误吗？我强制转换成 float 仍然不起作用。我的 torch 版本是 1.1，spconv 是 1.0。报错信息：File "/home/neousys/cjg/CenterPoint-KITTI/pcdet/models/dense_heads/centerpoint_head_single.py", line 687, in gaussian_focal_loss: pos_loss = (-(pred + eps).log() * (1 - pred).pow(alpha) * pos_weights).float() — RuntimeError: expected backend CUDA and dtype Float but got backend CUDA and dtype Byte