Trying to use VanillaDataManager as a component of my work, with parallel multi-GPU training
I want to use VanillaDataManager (nerfstudio-0.2.2) as a component of my own work. Single-GPU training works well, and I am now trying to run it on multiple GPUs for parallel training. However, I am not sure whether I have done it the right way. Here is part of my code:
"""
a training pipeline of my own - pipeline.py
"""
class MyPipeline(nn.Module):
def __init__(
self,
device: str,
test_mode: Literal["test", "val", "inference"] = "val",
world_size: int = 1,
local_rank: int = 0,
**kwargs
):
# build a bunch of log dir for experiment...
# VanillaDataManager of nerfstudio
datamanager_config = VanillaDataManagerConfig(
# _target=RayPruningDataManager,
dataparser=MinimalDataParserConfig(data=Path(self.instance_dir)),
eval_num_rays_per_batch=self.batch_size,
train_num_rays_per_batch=self.batch_size,
)
self.datamanager = VanillaDataManager(config=datamanager_config,
device=device,
test_mode=test_mode,
world_size=world_size,
local_rank=local_rank)
self.datamanager.to(device)
assert self.datamanager.train_dataset is not None, "Missing input dataset"
# initialize my nerf model
self.model = MyNerfModel(config=model_config,
transform=self.datamanager.train_dataparser_outputs.dataparser_transform,
scale=self.datamanager.train_dataparser_outputs.dataparser_scale,
device=device
)
self.model.to(device)
# DistributedDataParallel
self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.GPU_INDEX], broadcast_buffers=False, find_unused_parameters=True)
def train(self) -> None:
"""
training pipeline
"""
# ...
ray_bundle, batch = self.datamanager.next_train(step)
model_outputs = self.model(ray_bundle)
metrics_dict = self.model.module.get_metrics_dict(model_outputs, batch)
loss_dict = self.model.module.get_loss_dict(model_outputs, batch, metrics_dict)
# ...
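For context on what I expect: as far as I understand, each DDP rank draws its own ray batch from its own VanillaDataManager, so the effective batch size should be roughly world_size * train_num_rays_per_batch. For logging I was planning to average scalar losses across ranks with a small helper like the sketch below (`reduce_scalar` is just my own function, not a nerfstudio API; it only uses standard torch.distributed calls):

```python
import torch
import torch.distributed as dist


def reduce_scalar(value: torch.Tensor) -> torch.Tensor:
    """Average a scalar tensor over all DDP ranks (no-op when not distributed)."""
    if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
        value = value.clone()
        dist.all_reduce(value, op=dist.ReduceOp.SUM)
        value /= dist.get_world_size()
    return value


# usage inside train(), after loss_dict is computed:
#   total_loss = sum(loss_dict.values())
#   print(f"rank-averaged loss: {reduce_scalar(total_loss.detach()).item():.4f}")
```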
"""
Entrance to the training program - train.py
"""
def _set_random_seed(seed) -> None:
"""Set randomness seed in torch and numpy"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if __name__ == '__main__':
# conf = '/root/home/workspace/tetra-nerf_modified/tetra_sdf/test.conf'
# runner = TetraSDFTrainRunner(conf=conf)
parser = argparse.ArgumentParser()
parser.add_argument() # omit here...
opt = parser.parse_args()
"""
if opt.gpu == "auto":
deviceIDs = GPUtil.getAvailable(order='memory', limit=1, maxLoad=0.5, maxMemory=0.5, includeNan=False,
excludeID=[], excludeUUID=[])
if len(deviceIDs) == 0:
raise RuntimeError("No GPU available")
gpu = deviceIDs[0]
else:
gpu = opt.gpu
"""
gpu = opt.local_rank
# set distributed training
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
rank = int(os.environ["RANK"])
world_size = int(os.environ['WORLD_SIZE'])
print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}")
else:
rank = -1
world_size = -1
print(opt.local_rank)
torch.cuda.set_device(opt.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank, timeout=timedelta(1, 1800))
torch.distributed.barrier()
test_mode = "val"
device: TORCH_DEVICE = "cpu" if world_size == 0 else f"cuda:{opt.local_rank}"
_set_random_seed(42 + rank)
trainrunner = MyPipeline(device=device,
test_mode=test_mode,
world_size=world_size,
local_rank=opt.local_rank,
# opt.stuff...
)
trainrunner.train()
The command I use to launch training is `CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node 4 --nnodes=1 --node_rank=0 train.py`.
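(Side note: torch.distributed.launch is deprecated in newer PyTorch releases in favor of torchrun. If I switch to torchrun, my understanding is that the local rank is no longer passed as a `--local_rank` argument but through the `LOCAL_RANK` environment variable, so train.py would need a small change along these lines:)

```python
import os

import torch

# when launched with `torchrun --nproc_per_node 4 train.py` instead of
# torch.distributed.launch, read the local rank from the environment:
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
```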
By the way, I notice that the `DataManager` class has the attributes `train_sampler` and `eval_sampler`, but they are never used. Are they not useful for multi-GPU parallel training? Can anyone help? Waiting online.