3D-RETR

Problems with train.py

Open · Yxs-160 opened this issue 2 years ago · 0 comments

https://github.com/FomalhautB/3D-RETR/blob/a359ec453a930cf507aab442fac57b7bae3029ce/train.py#L76

When I ran the original code, I ran into the following problem:

python train.py --model_path SHAPENET_VOXEL --image_path SHAPENET_IMAGE --annot_path data/ShapeNet.json --transformer_config config/3d-retr-s.yaml --gpus 2

{ 'accelerator': None, 'accumulate_grad_batches': 1, 'amp_backend': 'native', 'amp_level': 'O2', 'annot_path': 'data/ShapeNet.json', 'auto_lr_find': False, 'auto_scale_batch_size': False, 'auto_select_gpus': False, 'background': (0, 0, 0), 'benchmark': False, 'check_val_every_n_epoch': 1, 'checkpoint_callback': True, 'continue_from': None, 'data_aug': False, 'default_root_dir': None, 'deterministic': False, 'distributed_backend': None, 'experiment_name': '3D-RETR', 'fast_dev_run': False, 'flush_logs_every_n_steps': 100, 'gpus': 2, 'gradient_clip_algorithm': 'norm', 'gradient_clip_val': 0.0, 'image_path': 'SHAPENET_IMAGE', 'limit_predict_batches': 1.0, 'limit_test_batches': 1.0, 'limit_train_batches': 1.0, 'limit_val_batches': 1.0, 'log_every_n_steps': 50, 'log_gpu_memory': None, 'logger': True, 'loss_type': 'dice', 'lr': 0.0001, 'max_epochs': None, 'max_steps': None, 'max_time': None, 'min_epochs': None, 'min_steps': None, 'model_path': 'SHAPENET_VOXEL', 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'num_nodes': 1, 'num_processes': 1, 'num_sanity_val_steps': 2, 'num_workers': 8, 'overfit_batches': 0.0, 'plugins': None, 'precision': 32, 'prepare_data_per_node': True, 'process_position': 0, 'profiler': None, 'progress_bar_refresh_rate': None, 'reload_dataloaders_every_epoch': False, 'replace_sampler_ddp': True, 'resume_from_checkpoint': None, 'sample_batch_num': 0, 'sched_factor': 1, 'seed': 0, 'stochastic_weight_avg': False, 'sync_batchnorm': False, 'terminate_on_nan': False, 'threshold': 0.5, 'tpu_cores': None, 'track_grad_norm': -1, 'train_batch_size': 8, 'transformer_config': 'config/3d-retr-s.yaml', 'truncated_bptt_steps': None, 'val_batch_size': 8, 'val_check_interval': 1.0, 'view_num': 1, 'weights_save_path': None, 'weights_summary': 'top'}

Global seed set to 0
{ 'decoder_depth': 6, 'decoder_dim': 192, 'decoder_dropout': 0.4, 'decoder_heads': 3, 'decoder_model': 'cnn', 'encoder_dropout': 0.4, 'encoder_model': 'vit_deit_tiny_distilled_patch16_224', 'num_cnn_layers': 3, 'num_resnet_blocks': 2, 'patch_num': 4, 'voxel_size': 32}
Ignored parameter "head.weight" on loading
Ignored parameter "head.bias" on loading
Ignored parameter "head_dist.weight" on loading
Ignored parameter "head_dist.bias" on loading
/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: You requested multiple GPUs but did not specify a backend, e.g. Trainer(accelerator="dp"|"ddp"|"ddp2"). Setting accelerator="ddp_spawn" for you.
  warnings.warn(*args, **kwargs)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Traceback (most recent call last):
  File "train.py", line 155, in <module>
    trainer.fit(model, train_loader, val_loader)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
    self._run(model)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
    self.dispatch()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
    self.accelerator.start_training(self)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 122, in start_training
    mp.spawn(self.new_process, **self.mp_spawn_kwargs)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 179, in start_processes
    process.start()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/process.py", line 105, in start
    self._popen = self._Popen(self)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/context.py", line 284, in _Popen
    return Popen(process_obj)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/popen_spawn_posix.py", line 32, in __init__
    super().__init__(process_obj)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/popen_fork.py", line 19, in __init__
    self._launch(process_obj)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/popen_spawn_posix.py", line 47, in _launch
    reduction.dump(process_obj, fp)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/reduction.py", line 60, in dump
    ForkingPickler(file, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function <lambda> at 0x7f5b76c48510>: attribute lookup <lambda> on __main__ failed
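As far as I can tell, this PicklingError happens because the ddp_spawn backend has to pickle whatever it hands to the spawned processes, and a plain lambda cannot be pickled by the standard pickle module. Below is a small stand-alone check I wrote to convince myself; select_first and select_first_fn are names I made up for illustration, not the actual code from train.py line 76:

import pickle

# A lambda cannot be pickled: pickle stores functions by "module.qualname",
# and a lambda's qualname is "<lambda>", so the lookup in the child process fails.
select_first = lambda x: x[0]
try:
    pickle.dumps(select_first)
except pickle.PicklingError as e:
    print(e)  # Can't pickle <function <lambda> at 0x...>: attribute lookup <lambda> on __main__ failed

# A module-level named function pickles fine, because the spawned process
# can re-import it by name.
def select_first_fn(x):
    return x[0]

pickle.dumps(select_first_fn)  # works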

I tried to fix it by following https://stackoverflow.com/a/25353243/1791279 and changed that line of code to pickle.dumps(lambda x: x[0]), but I still could not run the code successfully; another error occurred:

python train.py --model_path SHAPENET_VOXEL --image_path SHAPENET_IMAGE --annot_path data/ShapeNet.json --transformer_config config/3d-retr-s.yaml --gpus 2

{ 'accelerator': None, 'accumulate_grad_batches': 1, 'amp_backend': 'native', 'amp_level': 'O2', 'annot_path': 'data/ShapeNet.json', 'auto_lr_find': False, 'auto_scale_batch_size': False, 'auto_select_gpus': False, 'background': (0, 0, 0), 'benchmark': False, 'check_val_every_n_epoch': 1, 'checkpoint_callback': True, 'continue_from': None, 'data_aug': False, 'default_root_dir': None, 'deterministic': False, 'distributed_backend': None, 'experiment_name': '3D-RETR', 'fast_dev_run': False, 'flush_logs_every_n_steps': 100, 'gpus': 2, 'gradient_clip_algorithm': 'norm', 'gradient_clip_val': 0.0, 'image_path': 'SHAPENET_IMAGE', 'limit_predict_batches': 1.0, 'limit_test_batches': 1.0, 'limit_train_batches': 1.0, 'limit_val_batches': 1.0, 'log_every_n_steps': 50, 'log_gpu_memory': None, 'logger': True, 'loss_type': 'dice', 'lr': 0.0001, 'max_epochs': None, 'max_steps': None, 'max_time': None, 'min_epochs': None, 'min_steps': None, 'model_path': 'SHAPENET_VOXEL', 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'num_nodes': 1, 'num_processes': 1, 'num_sanity_val_steps': 2, 'num_workers': 8, 'overfit_batches': 0.0, 'plugins': None, 'precision': 32, 'prepare_data_per_node': True, 'process_position': 0, 'profiler': None, 'progress_bar_refresh_rate': None, 'reload_dataloaders_every_epoch': False, 'replace_sampler_ddp': True, 'resume_from_checkpoint': None, 'sample_batch_num': 0, 'sched_factor': 1, 'seed': 0, 'stochastic_weight_avg': False, 'sync_batchnorm': False, 'terminate_on_nan': False, 'threshold': 0.5, 'tpu_cores': None, 'track_grad_norm': -1, 'train_batch_size': 8, 'transformer_config': 'config/3d-retr-s.yaml', 'truncated_bptt_steps': None, 'val_batch_size': 8, 'val_check_interval': 1.0, 'view_num': 1, 'weights_save_path': None, 'weights_summary': 'top'}

Global seed set to 0
{ 'decoder_depth': 6, 'decoder_dim': 192, 'decoder_dropout': 0.4, 'decoder_heads': 3, 'decoder_model': 'cnn', 'encoder_dropout': 0.4, 'encoder_model': 'vit_deit_tiny_distilled_patch16_224', 'num_cnn_layers': 3, 'num_resnet_blocks': 2, 'patch_num': 4, 'voxel_size': 32}
Ignored parameter "head.weight" on loading
Ignored parameter "head.bias" on loading
Ignored parameter "head_dist.weight" on loading
Ignored parameter "head_dist.bias" on loading
/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: You requested multiple GPUs but did not specify a backend, e.g. Trainer(accelerator="dp"|"ddp"|"ddp2"). Setting accelerator="ddp_spawn" for you.
  warnings.warn(*args, **kwargs)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Global seed set to 0
initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/2
Global seed set to 0
initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/2

  | Name    | Type                     | Params
------------------------------------------------
0 | encoder | VisionTransformerEncoder | 5.5 M
1 | decoder | VoxelDecoderCNN          | 4.9 M
------------------------------------------------
10.4 M    Trainable params
0         Non-trainable params
10.4 M    Total params
41.761    Total estimated model params size (MB)

Validation sanity check: 0it [00:00, ?it/s]/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: num_workers>0, persistent_workers=False, and accelerator=ddp_spawn may result in data loading bottlenecks. Consider setting persistent_workers=True (this is a limitation of Python .spawn() and PyTorch)
  warnings.warn(*args, **kwargs)
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "train.py", line 155, in <module>
    trainer.fit(model, train_loader, val_loader)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
    self._run(model)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
    self.dispatch()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
    self.accelerator.start_training(self)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 122, in start_training
    mp.spawn(self.new_process, **self.mp_spawn_kwargs)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 172, in new_process
    results = trainer.run_stage()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
    return self.run_train()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 842, in run_train
    self.run_sanity_check(self.lightning_module)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1107, in run_sanity_check
    self.run_evaluation()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 949, in run_evaluation
    for batch_idx, batch in enumerate(dataloader):
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
    data = self._next_data()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1199, in _next_data
    return self._process_data(data)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1225, in _process_data
    data.reraise()
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
    raise self.exc_type(msg)
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/ubuntu/YaoXinSheng/3D-RETR-main/src/data/datasets.py", line 25, in __getitem__
    return self._dataset[self._indices[index].item()]
  File "/home/ubuntu/YaoXinSheng/3D-RETR-main/src/data/datasets.py", line 109, in __getitem__
    image = self._image_transforms(image)
  File "/home/ubuntu/YaoXinSheng/3D-RETR-main/src/data/transforms.py", line 32, in __call__
    rendering_images = t(rendering_images)
TypeError: 'bytes' object is not callable
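My guess (I may be wrong) is that this second error comes from my own change: pickle.dumps() serializes a function into a bytes object rather than returning anything callable, so when the transform pipeline later calls each transform (the rendering_images = t(rendering_images) line in src/data/transforms.py), it ends up calling a bytes object. A minimal reproduction I tried, with a made-up select_first function standing in for the original lambda:

import pickle

def select_first(x):          # made-up stand-in for the lambda in train.py
    return x[0]

serialized = pickle.dumps(select_first)
print(type(serialized))       # <class 'bytes'>

# The transform pipeline calls each transform like t(rendering_images),
# so a bytes object in that list reproduces exactly the reported error:
try:
    serialized([1, 2, 3])
except TypeError as e:
    print(e)                  # 'bytes' object is not callable

# pickle.loads would give the callable back, but that is probably not the
# right fix either; I suspect the transform should simply be a plain named
# function instead of a lambda, so that ddp_spawn can pickle it directly.
print(pickle.loads(serialized)([1, 2, 3]))   # 1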

I am still not sure what the proper fix is here. Could you help me?

Yxs-160 · Mar 28 '22