xtuner
执行 `NPROC_PER_NODE=2 xtuner train /root/StableDiffusionGPT/config/internlm2_1_8b_qlora_alpaca_e3_copy.py --work-dir /root/test/ft/train --deepspeed deepspeed_zero2` 指令运行报错
error log: Generating train split: 3457 examples [00:00, 14292.20 examples/s] Map (num_proc=32): 0%| | 0/3457 [00:00<?, ? examples/s] [rank0]: multiprocess.pool.RemoteTraceback: [rank0]: """ [rank0]: Traceback (most recent call last): [rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/multiprocess/pool.py", line 125, in worker [rank0]: result = (True, func(*args, **kwds)) [rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 623, in _write_generator_to_queue [rank0]: for i, result in enumerate(func(**kwargs)): [rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3458, in _map_single [rank0]: example = apply_function_on_filtered_inputs(example, i, offset=offset) [rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3361, in apply_function_on_filtered_inputs [rank0]: processed_inputs = function(*fn_args, *additional_args, **fn_kwargs) [rank0]: File "/root/test/xtuner/xtuner/dataset/map_fns/dataset_map_fns/openai_map_fn.py", line 22, in openai_map_fn [rank0]: messages = example['messages'] [rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 270, in getitem [rank0]: value = self.data[key] [rank0]: KeyError: 'messages' [rank0]: """
[rank0]: The above exception was the direct cause of the following exception:
[rank0]: Traceback (most recent call last):
[rank0]: File "/root/test/xtuner/xtuner/tools/train.py", line 360, in
[rank0]: main()
[rank0]: File "/root/test/xtuner/xtuner/tools/train.py", line 356, in main
[rank0]: runner.train()
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1160, in train
[rank0]: self._train_loop = self.build_train_loop(
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 958, in build_train_loop
[rank0]: loop = LOOPS.build(
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank0]: return self.build_func(cfg, *args, **kwargs, registry=self)
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank0]: obj = obj_cls(**args) # type: ignore
[rank0]: File "/root/test/xtuner/xtuner/engine/runner/loops.py", line 32, in init
[rank0]: dataloader = runner.build_dataloader(
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 824, in build_dataloader
[rank0]: dataset = DATASETS.build(dataset_cfg)
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank0]: return self.build_func(cfg, *args, **kwargs, registry=self)
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank0]: obj = obj_cls(**args) # type: ignore
[rank0]: File "/root/test/xtuner/xtuner/dataset/huggingface.py", line 308, in process_hf_dataset
[rank0]: dataset = process(**kwargs)
[rank0]: File "/root/test/xtuner/xtuner/dataset/huggingface.py", line 179, in process
[rank0]: dataset = map_dataset(dataset, dataset_map_fn, map_num_proc)
[rank0]: File "/root/test/xtuner/xtuner/dataset/huggingface.py", line 50, in map_dataset
[rank0]: dataset = dataset.map(dataset_map_fn, num_proc=map_num_proc)
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 593, in wrapper
[rank0]: out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 558, in wrapper
[rank0]: out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3197, in map
[rank0]: for rank, done, content in iflatmap_unordered(
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 663, in iflatmap_unordered
[rank0]: [async_result.get(timeout=0.05) for async_result in async_results]
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 663, in
[rank0]: [async_result.get(timeout=0.05) for async_result in async_results]
[rank0]: File "/root/.conda/envs/test/lib/python3.10/site-packages/multiprocess/pool.py", line 774, in get
[rank0]: raise self._value
[rank0]: KeyError: 'messages'
[rank1]:[E ProcessGroupGloo.cpp:144] Rank 1 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
[rank1]: Traceback (most recent call last):
[rank1]: File "/root/test/xtuner/xtuner/tools/train.py", line 360, in
[rank1]: main()
[rank1]: File "/root/test/xtuner/xtuner/tools/train.py", line 356, in main
[rank1]: runner.train()
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1160, in train
[rank1]: self._train_loop = self.build_train_loop(
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 958, in build_train_loop
[rank1]: loop = LOOPS.build(
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank1]: return self.build_func(cfg, *args, **kwargs, registry=self)
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank1]: obj = obj_cls(**args) # type: ignore
[rank1]: File "/root/test/xtuner/xtuner/engine/runner/loops.py", line 32, in init
[rank1]: dataloader = runner.build_dataloader(
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 824, in build_dataloader
[rank1]: dataset = DATASETS.build(dataset_cfg)
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank1]: return self.build_func(cfg, *args, **kwargs, registry=self)
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank1]: obj = obj_cls(**args) # type: ignore
[rank1]: File "/root/test/xtuner/xtuner/dataset/huggingface.py", line 313, in process_hf_dataset
[rank1]: dist.monitored_barrier(group=group_gloo, timeout=xtuner_dataset_timeout)
[rank1]: File "/root/.conda/envs/test/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3763, in monitored_barrier
[rank1]: return group_to_use.monitored_barrier(timeout, wait_all_ranks=wait_all_ranks)
[rank1]: RuntimeError: Rank 1 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
[rank1]: Original exception:
[rank1]: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [192.168.239.202]:35230
E0526 11:02:25.394000 139939038151872 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 7090) of binary: /root/.conda/envs/test/bin/python
Traceback (most recent call last):
File "/root/.conda/envs/test/bin/torchrun", line 8, in
sys.exit(main())
File "/root/.conda/envs/test/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 347, in wrapper
return f(*args, **kwargs)
File "/root/.conda/envs/test/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/root/.conda/envs/test/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/root/.conda/envs/test/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/.conda/envs/test/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/root/test/xtuner/xtuner/tools/train.py FAILED
Failures: [1]: time : 2024-05-26_11:02:25 host : intern-studio-083870 rank : 1 (local_rank: 1) exitcode : 1 (pid: 7091) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure): [0]: time : 2024-05-26_11:02:25 host : intern-studio-083870 rank : 0 (local_rank: 0) exitcode : 1 (pid: 7090) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html