GeneFacePlusPlus icon indicating copy to clipboard operation
GeneFacePlusPlus copied to clipboard

Is DDP (multi-GPU training) supported?

Open karl-tao-zhang opened this issue 11 months ago • 3 comments

Training works with CUDA_VISIBLE_DEVICES=0 (single GPU), but fails with CUDA_VISIBLE_DEVICES=0,1 (two GPUs, DDP). The full command and log are below.

Thank you.

(geneface) root@autodl-container-a45f4692b6-ed3c630d:~/autodl-tmp/GeneFacePlusPlus# CUDA_VISIBLE_DEVICES=0,1 python tasks/run.py --config=egs/datasets/obama/lm3d_radnerf_torso_sr.yaml --exp_name=motion2video_nerf/obama_torso --hparams=head_model_dir=checkpoints/motion2video_nerf/obama_head --reset

| set_hparams Unknow hparams: [] | Hparams chains: ['egs/egs_bases/radnerf/base.yaml', 'egs/egs_bases/radnerf/lm3d_radnerf.yaml', 'egs/datasets/obama/lm3d_radnerf_torso.yaml', 'egs/datasets/obama/lm3d_radnerf_torso_sr.yaml'] | Hparams: { "accumulate_grad_batches": 1, "add_eye_blink_cond": true, "ambient_coord_dim": 3, "amp": true, "base_config": [ "./lm3d_radnerf_torso.yaml" ], "binary_data_dir": "data/binary/videos", "bound": 1, "camera_offset": [ 0, 0, 0 ], "camera_scale": 4.0, "clip_grad_norm": 0.0, "clip_grad_value": 0, "cond_dropout_rate": 0.0, "cond_out_dim": 64, "cond_type": "idexp_lm3d_normalized", "cond_win_size": 1, "cuda_ray": true, "debug": false, "density_thresh": 10, "density_thresh_torso": 0.01, "desired_resolution": 2048, "dt_gamma": 0.00390625, "eval_max_batches": 100, "exp_name": "motion2video_nerf/obama_torso", "eye_blink_dim": 2, "far": 0.9, "finetune_lips": true, "finetune_lips_start_iter": 200000, "geo_feat_dim": 128, "grid_interpolation_type": "linear", "grid_size": 128, "grid_type": "tiledgrid", "gui_fovy": 21.24, "gui_h": 512, "gui_max_spp": 1, "gui_radius": 3.35, "gui_w": 512, "head_model_dir": "checkpoints/motion2video_nerf/obama_head", "hidden_dim_ambient": 128, "hidden_dim_color": 128, "hidden_dim_sigma": 128, "individual_embedding_dim": 4, "individual_embedding_num": 13000, "infer": false, "infer_audio_source_name": "", "infer_bg_img_fname": "", "infer_c2w_name": "", "infer_cond_name": "", "infer_lm3d_clamp_std": 1.5, "infer_lm3d_lle_percent": 0.25, "infer_lm3d_smooth_sigma": 0.0, "infer_out_video_name": "", "infer_scale_factor": 1.0, "infer_smo_std": 0.0, "infer_smooth_camera_path": true, "infer_smooth_camera_path_kernel_size": 7, "init_method": "tcp", "lambda_ambient": 1.0, "lambda_lap_ambient_loss": 0.0, "lambda_lpips_loss": 0.001, "lambda_torso_deform": 0.0, "lambda_weights_entropy": 0.0001, "load_ckpt": "", "load_imgs_to_memory": false, "log2_hashmap_size": 16, "lpips_mode": "vgg19_v2", "lpips_start_iters": 200000, "lr": 0.0005, 
"max_ray_batch": 4096, "max_steps": 16, "max_updates": 250000, "min_near": 0.05, "n_rays": 65536, "near": 0.3, "nerf_keypoint_mode": "lm68", "not_save_modules": [ "criterion_lpips", "dual_disc" ], "num_ckpt_keep": 1, "num_layers_ambient": 3, "num_layers_color": 2, "num_layers_sigma": 3, "num_sanity_val_steps": 2, "num_steps": 16, "num_updates": 250000, "num_valid_plots": 5, "optimizer_adam_beta1": 0.9, "optimizer_adam_beta2": 0.999, "polygon_face_mask": true, "print_nan_grads": false, "processed_data_dir": "data/processed/videos", "raw_data_dir": "data/raw/videos", "resume_from_checkpoint": 0, "save_best": true, "save_codes": [ "tasks", "modules", "egs" ], "save_gt": true, "scheduler": "exponential", "seed": 9999, "smo_win_size": 3, "smooth_lips": false, "sr_start_iters": 0, "start_rank": 0, "task_cls": "tasks.radnerfs.radnerf_torso_sr.RADNeRFTorsoTask", "tb_log_interval": 100, "torso_head_aware": true, "torso_individual_embedding_dim": 8, "torso_shrink": 0.8, "torso_train_mode": 1, "update_extra_interval": 16, "upsample_steps": 0, "use_window_cond": true, "val_check_interval": 2000, "valid_infer_interval": 10000, "valid_monitor_key": "val_loss", "valid_monitor_mode": "min", "validate": false, "video_id": "obama", "warmup_updates": 0, "weight_decay": 0, "with_att": true, "with_sr": true, "work_dir": "checkpoints/motion2video_nerf/obama_torso", "world_size": -1, "zero_dummy": true } 03/20 10:30:44 AM GPU available: True, GPU used: [0, 1], world_size: 2, multi-machine training: False before init_tcp! before init_tcp! 03/20 10:30:47 AM Added key: store_based_barrier_key:1 to store for rank: 1 03/20 10:30:47 AM Added key: store_based_barrier_key:1 to store for rank: 0 03/20 10:30:47 AM Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 03/20 10:30:47 AM Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. after init_tcp! 
/root/autodl-tmp/GeneFacePlusPlus/tasks/radnerfs/dataset_utils.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). self.lm68s = torch.tensor(self.lm2ds[:, index_lm68_from_lm478, :]) val: Smooth head trajectory (rotation and translation) with a window size of 7 | Copied codes to checkpoints/motion2video_nerf/obama_torso/codes/20240320103056. | load 'model' from 'checkpoints/motion2video_nerf/obama_head/model_ckpt_steps_250000.ckpt', strict=True Loaded Head Model from checkpoints/motion2video_nerf/obama_head Loaded state_dict of Head Model to the RADNeRFTorso Model | cond_prenet Trainable Parameters: 0.000M | blink_embedding Trainable Parameters: 0.000M | blink_encoder Trainable Parameters: 0.000M | cond_att_net Trainable Parameters: 0.000M | position_embedder Trainable Parameters: 0.000M | ambient_net Trainable Parameters: 0.000M | ambient_embedder Trainable Parameters: 0.000M | sigma_net Trainable Parameters: 0.000M | direction_embedder Trainable Parameters: 0.000M | color_net Trainable Parameters: 0.000M | dropout Trainable Parameters: 0.000M | lm68_embedder Trainable Parameters: 0.000M | torso_pose_embedder Trainable Parameters: 0.000M | torso_deform_pos_embedder Trainable Parameters: 0.000M | torso_embedder Trainable Parameters: 1.111M | head_color_weights_encoder Trainable Parameters: 0.000M | torso_deform_net Trainable Parameters: 0.017M | torso_canonicial_net Trainable Parameters: 0.008M | sr_net Trainable Parameters: 0.271M Sanity Val: 0%| | 0/2 [00:00<?, ?step/s]/root/autodl-tmp/GeneFacePlusPlus/tasks/radnerfs/dataset_utils.py:427: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). 
sample['lm68'] = torch.tensor(self.lm68s[idx].reshape([68*2])) Sanity Val: 0%| | 0/2 [00:00<?, ?step/s] Traceback (most recent call last): File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/trainer.py", line 236, in run_single_process self.train() File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/trainer.py", line 314, in train self.evaluate(self.task, False, 'Sanity Val', max_batches=self.num_sanity_val_steps) File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/trainer.py", line 288, in evaluate output = task(*args) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/ddp_utils.py", line 82, in forward inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1223, in scatter return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 52, in scatter_kwargs inputs = scatter(inputs, target_gpus, dim) if inputs else [] File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 44, in scatter res = scatter_map(inputs) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 31, in scatter_map return list(zip(*map(scatter_map, obj))) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 35, in scatter_map return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))] File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 31, in scatter_map return list(zip(*map(scatter_map, obj))) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", 
line 27, in scatter_map return Scatter.apply(target_gpus, None, dim, obj) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply return super().apply(*args, **kwargs) # type: ignore[misc] File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/_functions.py", line 96, in forward outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 189, in scatter return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams)) RuntimeError: chunk expects at least a 1-dimensional tensor Traceback (most recent call last): File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/trainer.py", line 141, in fit mp.start_processes(self.ddp_run,nprocs=self.num_local_gpus, args=(task_cls, copy.deepcopy(hparams)), start_method='spawn') File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes while not context.join(): File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join raise ProcessExitedException( torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGTERM Traceback (most recent call last): File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/trainer.py", line 141, in fit mp.start_processes(self.ddp_run,nprocs=self.num_local_gpus, args=(task_cls, copy.deepcopy(hparams)), start_method='spawn') File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes while not context.join(): File "/root/miniconda3/envs/geneface/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join raise ProcessExitedException( torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGTERM

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/root/autodl-tmp/GeneFacePlusPlus/tasks/run.py", line 28, in run_task() File "/root/autodl-tmp/GeneFacePlusPlus/tasks/run.py", line 16, in run_task task_cls.start() File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/base_task.py", line 272, in start trainer.fit(cls) File "/root/autodl-tmp/GeneFacePlusPlus/utils/commons/trainer.py", line 154, in fit subprocess.check_call(f'pkill -f "GeneFace_worker ({hparams["work_dir"]}"', shell=True) File "/root/miniconda3/envs/geneface/lib/python3.9/subprocess.py", line 373, in check_call raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command 'pkill -f "GeneFace_worker (checkpoints/motion2video_nerf/obama_torso"' returned non-zero exit status 1.

karl-tao-zhang avatar Mar 20 '24 02:03 karl-tao-zhang