MotionGPT
MotionGPT copied to clipboard
[Training] Stops with an error: The algorithm failed to converge because the input matrix contained non-finite values.
After setting up the dataset, running `python -m train --cfg configs/config_h3d_stage1.yaml --nodebug` trains for 9 epochs and then fails during the validation pass with the error below.
1 Loading HumanML3D train ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.7/3.7 GB 0:00:00 2 [?25hPointer Pointing at 0 3 Loading HumanML3D test ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 702.9/702.9 MB 0:00:00 4 [?25hPointer Pointing at 0 5 ┏━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┓ 6 ┃ ┃ Name ┃ Type ┃ Params ┃ 7 ┡━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━┩ 8 │ 0 │ metrics │ BaseMetrics │ 65.1 M │ 9 │ 1 │ vae │ VQVae │ 19.4 M │ 10 │ 2 │ lm │ MLM │ 248 M │ 11 │ 3 │ _losses │ ModuleDict │ 0 │ 12 └───┴─────────┴─────────────┴────────┘ 13 Trainable params: 267 M 14 Non-trainable params: 65.1 M 15 Total params: 332 M 16 Total estimated model params size (MB): 1.3 K 17 LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 18 2024-01-07 16:06:24,603 Sanity checking ok. 19 Epoch 9/999998 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 88/88 0:00:12 • 0:00:00 7.37it/s
20 2024-01-07 16:06:25,531 Training started 21 2024-01-07 16:06:37,118 Epoch 0: 22 2024-01-07 16:06:50,415 Epoch 1: loss_total 8.192e-01 23 2024-01-07 16:07:02,251 Epoch 2: loss_total 6.196e-01 24 2024-01-07 16:07:14,052 Epoch 3: loss_total 5.443e-01 25 2024-01-07 16:07:25,955 Epoch 4: loss_total 4.995e-01 26 2024-01-07 16:07:37,948 Epoch 5: loss_total 4.704e-01 27 2024-01-07 16:07:50,044 Epoch 6: loss_total 4.477e-01 28 2024-01-07 16:08:02,174 Epoch 7: loss_total 4.288e-01 29 2024-01-07 16:08:14,288 Epoch 8: loss_total 4.166e-01 30 Validation ━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 29/146 0:00:04 • 0:00:20 6.10it/s 31 [?25h 32 Traceback (most recent call last): 33 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/runpy.py", line 196, in _run_module_as_main 34 return _run_code(code, main_globals, None, 35 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/runpy.py", line 86, in _run_code 36 exec(code, run_globals) 37 File "/home/yasha/workspace/mocap/MotionGPT/train.py", line 94, in38 main() 39 File "/home/yasha/workspace/mocap/MotionGPT/train.py", line 85, in main 40 trainer.fit(model, datamodule=datamodule) 41 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit 42 call._call_and_handle_interrupt( 43 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt 44 return trainer_fn(*args, **kwargs) 45 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl 46 self._run(model, ckpt_path=ckpt_path) 47 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 989, in _run 48 results = self._run_stage() 49 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, 
in _run_stage 50 self.fit_loop.run() 51 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 202, in run 52 self.advance() 53 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 359, in advance 54 self.epoch_loop.run(self._data_fetcher) 55 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 137, in run 56 self.on_advance_end(data_fetcher) 57 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 285, in on_advance_end 58 self.val_loop.run() 59 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 182, in _decorator 60 return loop_run(self, *args, **kwargs) 61 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 134, in run 62 self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) 63 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 391, in _evaluation_step 64 output = call._call_strategy_hook(trainer, hook_name, *step_args) 65 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 309, in _call_strategy_hook 66 output = fn(*args, **kwargs) 67 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 403, in validation_step 68 return self.lightning_module.validation_step(*args, **kwargs) 69 File "/home/yasha/workspace/mocap/MotionGPT/mGPT/models/base.py", line 28, in validation_step 70 return self.allsplit_step("val", batch, batch_idx) 71 File "/home/yasha/workspace/mocap/MotionGPT/mGPT/models/mgpt.py", line 454, in 
allsplit_step 72 metric).update(rs_set["joints_rst"], 73 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/torchmetrics/metric.py", line 470, in wrapped_func 74 raise err 75 File "/home/yasha/miniconda3/envs/motiongpt_env/lib/python3.10/site-packages/torchmetrics/metric.py", line 460, in wrapped_func 76 update(*args, **kwargs) 77 File "/home/yasha/workspace/mocap/MotionGPT/mGPT/metrics/mr.py", line 96, in update 78 self.PAMPJPE += torch.sum(calc_pampjpe(rst[i], ref[i])) 79 File "/home/yasha/workspace/mocap/MotionGPT/mGPT/metrics/utils.py", line 397, in calc_pampjpe 80 preds_tranformed, PA_transform = batch_compute_similarity_transform_torch( 81 File "/home/yasha/workspace/mocap/MotionGPT/mGPT/metrics/utils.py", line 296, in batch_compute_similarity_transform_torch 82 U, s, V = torch.svd(K) 83 torch._C._LinAlgError: linalg.svd: (Batch element 0): The algorithm failed to converge because the input matrix contained non-finite values.
How do we fix this?