
Trying to use VanillaDataManager as a component of my work, with parallel training on multiple GPUs

Open liuxiaozhu01 opened this issue 1 year ago • 6 comments

I want to use the VanillaDataManager (nerfstudio-0.2.2) as a component of my work. Single-GPU training works well, and I'm now trying to make it run on multiple GPUs for parallel training. However, I'm not sure whether I've done it the right way. Here is part of my code below.

"""
a training pipeline of my own - pipeline.py
"""
class MyPipeline(nn.Module):
    def __init__(
        self,
        device: str,
        test_mode: Literal["test", "val", "inference"] = "val",
        world_size: int = 1,
        local_rank: int = 0,
        **kwargs
    ):
        
        # build a bunch of log dir for experiment...
        
        # VanillaDataManager of nerfstudio
        datamanager_config = VanillaDataManagerConfig(
            # _target=RayPruningDataManager,
            dataparser=MinimalDataParserConfig(data=Path(self.instance_dir)),
            eval_num_rays_per_batch=self.batch_size,
            train_num_rays_per_batch=self.batch_size,
        )
        
        self.datamanager = VanillaDataManager(
            config=datamanager_config,
            device=device,
            test_mode=test_mode,
            world_size=world_size,
            local_rank=local_rank,
        )
        self.datamanager.to(device)
        
        assert self.datamanager.train_dataset is not None, "Missing input dataset"
        
        # initialize my nerf model
        self.model = MyNerfModel(
            config=model_config,  # model_config is defined elsewhere (omitted above)
            transform=self.datamanager.train_dataparser_outputs.dataparser_transform,
            scale=self.datamanager.train_dataparser_outputs.dataparser_scale,
            device=device,
        )
        
        self.model.to(device)
        # wrap with DistributedDataParallel; local_rank is the GPU index for this process
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model,
            device_ids=[local_rank],
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
        
    def train(self) -> None:
        """
        training pipeline
        """
        # ...
        ray_bundle, batch = self.datamanager.next_train(step)
        model_outputs = self.model(ray_bundle)
        # the wrapped model is a DistributedDataParallel instance, so the
        # original model's methods are reached through .module
        metrics_dict = self.model.module.get_metrics_dict(model_outputs, batch)
        loss_dict = self.model.module.get_loss_dict(model_outputs, batch, metrics_dict)
        # ...
"""
Entrance to the training program - train.py
"""
def _set_random_seed(seed) -> None:
    """Set randomness seed in torch and numpy"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


if __name__ == '__main__':
    # conf = '/root/home/workspace/tetra-nerf_modified/tetra_sdf/test.conf'
    # runner = TetraSDFTrainRunner(conf=conf)
    
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=0)
    # other arguments omitted here...

    opt = parser.parse_args()
    """
    if opt.gpu == "auto":
        deviceIDs = GPUtil.getAvailable(order='memory', limit=1, maxLoad=0.5, maxMemory=0.5, includeNan=False,
                                        excludeID=[], excludeUUID=[])
        if len(deviceIDs) == 0:
            raise RuntimeError("No GPU available")
        gpu = deviceIDs[0]
    else:
        gpu = opt.gpu
    """
    gpu = opt.local_rank

    # set distributed training
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        rank = int(os.environ["RANK"])
        world_size = int(os.environ['WORLD_SIZE'])
        print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}")
    else:
        rank = -1
        world_size = -1

    print(opt.local_rank)
    torch.cuda.set_device(opt.local_rank)
    torch.distributed.init_process_group(
        backend='nccl', init_method='env://', world_size=world_size, rank=rank,
        timeout=timedelta(days=1, seconds=1800),
    )
    torch.distributed.barrier()
    test_mode = "val"
    device: TORCH_DEVICE = "cpu" if world_size == 0 else f"cuda:{opt.local_rank}"
    
    _set_random_seed(42 + rank)
    trainrunner = MyPipeline(
        device=device,
        test_mode=test_mode,
        world_size=world_size,
        local_rank=opt.local_rank,
        # opt.stuff...
    )
    
    trainrunner.train()
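    # (not in my original snippet, but probably good practice: tear down the
    # process group once training is done)
    torch.distributed.destroy_process_group()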

The command to execute is: CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 --node_rank=0 train.py
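(On recent PyTorch versions torch.distributed.launch is deprecated; if I understand correctly, the equivalent torchrun --nproc_per_node=4 --nnodes=1 --node_rank=0 train.py should also work, with the local rank read from the LOCAL_RANK environment variable rather than a --local_rank argument.)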

By the way, I notice that the DataManager class has attributes train_sampler and eval_sampler, but they are never used. Are they not useful for multi-GPU parallel training? Can anyone help? Waiting online.
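For reference, below is the standard PyTorch pattern I would have expected those sampler attributes to be used for. This is just my guess based on the attribute names, not nerfstudio's actual implementation; dataset, world_size, rank, and batch_size here stand in for whatever the data manager would pass.

from torch.utils.data import DataLoader, DistributedSampler

def build_sharded_loader(dataset, world_size: int, rank: int, batch_size: int) -> DataLoader:
    # hypothetical helper: give each rank a disjoint shard of the dataset
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=4)

# at the start of every epoch, set_epoch() should be called so the shuffle
# differs across epochs:
#   loader.sampler.set_epoch(epoch)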

liuxiaozhu01 · Oct 10 '23 13:10