zero_nlp
How do I fix this error when running the Chatglm6b_ModelParallel model?
warnings.warn(
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/zero_nlp/Chatglm6b_ModelParallel/train_model_all.py:321 in <module> │
│ │
│ 318 │ train_dataset=tokenized_datasets["train"], │
│ 319 │ eval_dataset=tokenized_datasets["valid"], │
│ 320 ) │
│ ❱ 321 trainer.train() │
│ 322 │
│ │
│ /home/zero_nlp/Chatglm6b_ModelParallel/MyTrainer.py:1629 in train │
│ │
│ 1626 │ │ inner_training_loop = find_executable_batch_size( │
│ 1627 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1628 │ │ ) │
│ ❱ 1629 │ │ return inner_training_loop( │
│ 1630 │ │ │ args=args, │
│ 1631 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1632 │ │ │ trial=trial, │
│ │
│ /home/zero_nlp/Chatglm6b_ModelParallel/MyTrainer.py:1716 in _inner_training_loop │
│ │
│ 1713 │ │ if args.gradient_checkpointing: │
│ 1714 │ │ │ self.model.gradient_checkpointing_enable() │
│ 1715 │ │ │
│ ❱ 1716 │ │ model = self._wrap_model(self.model_wrapped) │
│ 1717 │ │ │
│ 1718 │ │ if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: │
│ 1719 │ │ │ self._load_from_checkpoint(resume_from_checkpoint, model) │
│ │
│ /home/zero_nlp/Chatglm6b_ModelParallel/MyTrainer.py:1541 in _wrap_model │
│ │
│ 1538 │ │ │ │ kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb │
│ 1539 │ │ │ if is_torch_neuroncore_available(): │
│ 1540 │ │ │ │ return model │
│ ❱ 1541 │ │ │ model = nn.parallel.DistributedDataParallel( │
│ 1542 │ │ │ │ model.cuda(), │
│ 1543 │ │ │ │ device_ids=[self.args.local_rank] if self.args._n_gpu != 0 else None, │
│ 1544 │ │ │ │ output_device=self.args.local_rank if self.args._n_gpu != 0 else None, │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/torch/nn/parallel/distributed.py:625 in __init__ │
│ │
│ 622 │ │ │ self.output_device = _get_device_index(output_device, True) │
│ 623 │ │ │
│ 624 │ │ if process_group is None: │
│ ❱ 625 │ │ │ self.process_group = _get_default_group() │
│ 626 │ │ else: │
│ 627 │ │ │ self.process_group = process_group │
│ 628 │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:707 in │
│ _get_default_group │
│ │
│ 704 │ Getting the default process group created by init_process_group │
│ 705 │ """ │
│ 706 │ if not is_initialized(): │
│ ❱ 707 │ │ raise RuntimeError( │
│ 708 │ │ │ "Default process group has not been initialized, " │
│ 709 │ │ │ "please make sure to call init_process_group." │
│ 710 │ │ ) │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.
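For context, this RuntimeError is raised because DistributedDataParallel is being constructed before torch.distributed.init_process_group has ever been called. Below is a minimal sketch of what PyTorch expects when you actually do want DDP (data parallelism); the backend choice and environment variables are illustrative assumptions, not values from this repo:

```python
# Minimal sketch, assuming a proper distributed launch (e.g. torchrun) that sets
# RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT. Not the repo's own code.
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_with_ddp(model: torch.nn.Module) -> DDP:
    # DDP requires an initialized default process group; this is the step the
    # traceback above complains about.
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl", init_method="env://")
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)
    return DDP(model.cuda(local_rank), device_ids=[local_rank], output_device=local_rank)
```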
OP, did you ever solve this? I'm running into the same thing.
Did you modify the parameters? The code I wrote uses model parallelism only, but judging from your error it looks like the problem was caused by data parallelism. Please read the readme.md I wrote and the notes carefully.
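If the goal is pure model parallelism, one way to catch this early is to verify before trainer.train() that no distributed (data-parallel) launch is active. The check below is a hypothetical sketch, assuming a Hugging Face TrainingArguments-style object with a local_rank field; it is not part of this repo:

```python
# Hypothetical sanity check (assumption, not repo code): for a model-parallel-only
# run, there should be no distributed launcher and no default process group,
# otherwise the Trainer's _wrap_model path tries to build DistributedDataParallel.
import torch.distributed as dist

def assert_model_parallel_only(training_args) -> None:
    # local_rank == -1 means no distributed launcher (torchrun etc.) set it.
    assert getattr(training_args, "local_rank", -1) == -1, (
        "local_rank != -1: a distributed launcher is active, so the Trainer "
        "will try to wrap the model in DistributedDataParallel (data parallel)."
    )
    assert not dist.is_initialized(), "A default process group exists; expected none."
```

In practice this means launching with plain `python train_model_all.py` rather than `torchrun` or `torch.distributed.launch`, so the DistributedDataParallel branch in MyTrainer._wrap_model is never taken.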