trafficstars
🐛 Describe the bug
My execution command: torchrun --standalone --nproc_per_node=4 ./examples/train_sft.py --pretrain "/share/disk1/xiangchaoqi/gpt_test/pytorch_model-00033-of-00033.bin" --model 'llama' --strategy colossalai_zero2 --log_interval 10 --save_path output/Coati-7B --dataset /share/disk1/xiangchaoqi/gpt_test/instinwild_ch.json --batch_size 4 --accimulation_steps 8 --lr 2e-5 --max_datasets_size 512 --max_epochs 1
Execution path: ColossalAI/applications/Chat
The log is shown below:
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Traceback (most recent call last):
File "./examples/train_sft.py", line 7, in <module>
from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/__init__.py", line 1, in <module>
from .prompt_dataset import PromptDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/prompt_dataset.py", line 12, in <module>
from colossalai.logging import get_dist_logger
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/__init__.py", line 1, in <module>
from .initialize import (
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/initialize.py", line 18, in <module>
from colossalai.amp import AMP_TYPE, convert_to_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/__init__.py", line 11, in <module>
from .apex_amp import convert_to_apex_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/__init__.py", line 4, in <module>
from .apex_amp import ApexAMPOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/apex_amp.py", line 13, in <module>
from colossalai.nn.optimizer import ColossalaiOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/__init__.py", line 1, in <module>
from ._ops import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/__init__.py", line 1, in <module>
from .addmm import colo_addmm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/addmm.py", line 6, in <module>
from ._utils import GeneralTensor, Number, convert_to_colo_tensor, reduce_grad, reduce_input
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/_utils.py", line 7, in <module>
from colossalai.nn.layer.utils import divide
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/__init__.py", line 1, in <module>
from .colossalai_layer import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/__init__.py", line 2, in <module>
from .dropout import Dropout
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/dropout.py", line 5, in <module>
from ..parallel_1d import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/__init__.py", line 1, in <module>
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/layers.py", line 17, in <module>
from colossalai.kernel import LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/__init__.py", line 1, in <module>
from .cuda_native import FusedScaleMaskSoftmax, LayerNorm, MultiHeadAttention
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/__init__.py", line 1, in <module>
from .layer_norm import MixedFusedLayerNorm as LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/layer_norm.py", line 12, in <module>
from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
ModuleNotFoundError: No module named 'colossalai.kernel.op_builder'
Traceback (most recent call last):
File "./examples/train_sft.py", line 7, in
from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/init.py", line 1, in
from .prompt_dataset import PromptDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/prompt_dataset.py", line 12, in
from colossalai.logging import get_dist_logger
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/init.py", line 1, in
from .initialize import (
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/initialize.py", line 18, in
from colossalai.amp import AMP_TYPE, convert_to_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/init.py", line 11, in
from .apex_amp import convert_to_apex_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/init.py", line 4, in
from .apex_amp import ApexAMPOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/apex_amp.py", line 13, in
from colossalai.nn.optimizer import ColossalaiOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/init.py", line 1, in
from ._ops import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/init.py", line 1, in
from .addmm import colo_addmm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/addmm.py", line 6, in
from ._utils import GeneralTensor, Number, convert_to_colo_tensor, reduce_grad, reduce_input
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/_utils.py", line 7, in
from colossalai.nn.layer.utils import divide
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/init.py", line 1, in
from .colossalai_layer import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/init.py", line 2, in
from .dropout import Dropout
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/dropout.py", line 5, in
from ..parallel_1d import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/init.py", line 1, in
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/layers.py", line 17, in
from colossalai.kernel import LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/init.py", line 1, in
from .cuda_native import FusedScaleMaskSoftmax, LayerNorm, MultiHeadAttention
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/init.py", line 1, in
from .layer_norm import MixedFusedLayerNorm as LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/layer_norm.py", line 12, in
from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
ModuleNotFoundError: No module named 'colossalai.kernel.op_builder'
Traceback (most recent call last):
File "./examples/train_sft.py", line 7, in
from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/init.py", line 1, in
from .prompt_dataset import PromptDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/prompt_dataset.py", line 12, in
from colossalai.logging import get_dist_logger
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/init.py", line 1, in
from .initialize import (
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/initialize.py", line 18, in
from colossalai.amp import AMP_TYPE, convert_to_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/init.py", line 11, in
from .apex_amp import convert_to_apex_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/init.py", line 4, in
from .apex_amp import ApexAMPOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/apex_amp.py", line 13, in
from colossalai.nn.optimizer import ColossalaiOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/init.py", line 1, in
from ._ops import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/init.py", line 1, in
from .addmm import colo_addmm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/addmm.py", line 6, in
from ._utils import GeneralTensor, Number, convert_to_colo_tensor, reduce_grad, reduce_input
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/_utils.py", line 7, in
from colossalai.nn.layer.utils import divide
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/init.py", line 1, in
from .colossalai_layer import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/init.py", line 2, in
from .dropout import Dropout
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/dropout.py", line 5, in
from ..parallel_1d import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/init.py", line 1, in
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/layers.py", line 17, in
from colossalai.kernel import LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/init.py", line 1, in
from .cuda_native import FusedScaleMaskSoftmax, LayerNorm, MultiHeadAttention
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/init.py", line 1, in
from .layer_norm import MixedFusedLayerNorm as LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/layer_norm.py", line 12, in
from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
ModuleNotFoundError: No module named 'colossalai.kernel.op_builder'
Traceback (most recent call last):
File "./examples/train_sft.py", line 7, in
from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/init.py", line 1, in
from .prompt_dataset import PromptDataset
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/coati/dataset/prompt_dataset.py", line 12, in
from colossalai.logging import get_dist_logger
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/init.py", line 1, in
from .initialize import (
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/initialize.py", line 18, in
from colossalai.amp import AMP_TYPE, convert_to_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/init.py", line 11, in
from .apex_amp import convert_to_apex_amp
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/init.py", line 4, in
from .apex_amp import ApexAMPOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/amp/apex_amp/apex_amp.py", line 13, in
from colossalai.nn.optimizer import ColossalaiOptimizer
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/init.py", line 1, in
from ._ops import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/init.py", line 1, in
from .addmm import colo_addmm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/addmm.py", line 6, in
from ._utils import GeneralTensor, Number, convert_to_colo_tensor, reduce_grad, reduce_input
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/_ops/_utils.py", line 7, in
from colossalai.nn.layer.utils import divide
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/init.py", line 1, in
from .colossalai_layer import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/init.py", line 2, in
from .dropout import Dropout
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/dropout.py", line 5, in
from ..parallel_1d import *
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/init.py", line 1, in
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/layers.py", line 17, in
from colossalai.kernel import LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/init.py", line 1, in
from .cuda_native import FusedScaleMaskSoftmax, LayerNorm, MultiHeadAttention
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/init.py", line 1, in
from .layer_norm import MixedFusedLayerNorm as LayerNorm
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/colossalai/kernel/cuda_native/layer_norm.py", line 12, in
from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
ModuleNotFoundError: No module named 'colossalai.kernel.op_builder'
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2124) of binary: /root/anaconda3/envs/coati/bin/python
Traceback (most recent call last):
File "/root/anaconda3/envs/coati/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/anaconda3/envs/coati/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Environment
pip list:
aiohttp 3.8.4
aiosignal 1.3.1
anyio 3.6.2
appdirs 1.4.4
async-timeout 4.0.2
attrs 22.2.0
bcrypt 4.0.1
blessed 1.20.0
certifi 2022.5.18.1
cffi 1.15.1
cfgv 3.3.1
charset-normalizer 3.1.0
click 8.1.3
cmake 3.26.3
coati 1.0.0
colossalai 0.2.8
contexttimer 0.3.3
cryptography 40.0.1
dataclasses-json 0.5.7
datasets 2.11.0
dill 0.3.6
distlib 0.3.6
docker-pycreds 0.4.0
fabric 3.0.0
fastapi 0.95.0
filelock 3.11.0
frozenlist 1.3.3
fsspec 2023.4.0
gitdb 4.0.10
GitPython 3.1.31
gpustat 1.1
greenlet 2.0.2
huggingface-hub 0.13.4
identify 2.5.22
idna 3.4
invoke 2.0.0
Jinja2 3.1.2
langchain 0.0.138
lit 16.0.1
loralib 0.1.1
markdown-it-py 2.2.0
MarkupSafe 2.1.2
marshmallow 3.19.0
marshmallow-enum 1.5.1
mdurl 0.1.2
mpmath 1.3.0
multidict 6.0.4
multiprocess 0.70.14
mypy-extensions 1.0.0
networkx 3.1
ninja 1.11.1
nodeenv 1.7.0
numpy 1.24.2
nvidia-cublas-cu11 11.10.3.66
nvidia-cuda-cupti-cu11 11.7.101
nvidia-cuda-nvrtc-cu11 11.7.99
nvidia-cuda-runtime-cu11 11.7.99
nvidia-cudnn-cu11 8.5.0.96
nvidia-cufft-cu11 10.9.0.58
nvidia-curand-cu11 10.2.10.91
nvidia-cusolver-cu11 11.4.0.1
nvidia-cusparse-cu11 11.7.4.91
nvidia-ml-py 11.525.112
nvidia-nccl-cu11 2.14.3
nvidia-nvtx-cu11 11.7.91
openapi-schema-pydantic 1.2.4
packaging 23.1
pandas 2.0.0
paramiko 3.1.0
pathtools 0.1.2
pip 21.2.4
platformdirs 3.2.0
pre-commit 3.2.2
protobuf 4.22.3
psutil 5.9.4
pyarrow 11.0.0
pycparser 2.21
pydantic 1.10.7
Pygments 2.15.0
PyNaCl 1.5.0
python-dateutil 2.8.2
pytz 2023.3
PyYAML 6.0
regex 2023.3.23
requests 2.28.2
responses 0.18.0
rich 13.3.4
safetensors 0.3.0
sentencepiece 0.1.98
sentry-sdk 1.19.1
setproctitle 1.3.2
setuptools 61.2.0
six 1.16.0
smmap 5.0.0
sniffio 1.3.0
SQLAlchemy 1.4.47
sse-starlette 1.3.3
starlette 0.26.1
sympy 1.11.1
tenacity 8.2.2
tokenizers 0.13.3
torch 1.13.1
tqdm 4.65.0
transformers 4.28.0.dev0
triton 2.0.0
typing_extensions 4.5.0
typing-inspect 0.8.0
tzdata 2023.3
urllib3 1.26.15
virtualenv 20.21.0
wandb 0.14.2
wcwidth 0.2.6
wheel 0.37.1
xxhash 3.2.0
yarl 1.8.2