🐛 Describe the bug
When I run train_dummy.py, the following error occurs:
Traceback (most recent call last):
File "/home/whong/ChatGPT/examples/train_dummy.py", line 9, in <module>
from chatgpt.trainer import PPOTrainer
File "/home/whong/ChatGPT/chatgpt/trainer/__init__.py", line 1, in <module>
from .base import Trainer
File "/home/whong/ChatGPT/chatgpt/trainer/base.py", line 11, in <module>
from .callbacks import Callback
File "/home/whong/ChatGPT/chatgpt/trainer/callbacks/__init__.py", line 3, in <module>
from .save_checkpoint import SaveCheckpoint
File "/home/whong/ChatGPT/chatgpt/trainer/callbacks/save_checkpoint.py", line 4, in <module>
from chatgpt.trainer.strategies import ColossalAIStrategy, Strategy
File "/home/whong/ChatGPT/chatgpt/trainer/strategies/__init__.py", line 2, in <module>
from .colossalai import ColossalAIStrategy
File "/home/whong/ChatGPT/chatgpt/trainer/strategies/colossalai.py", line 11, in <module>
import colossalai
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/__init__.py", line 1, in <module>
from .initialize import (
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/initialize.py", line 18, in <module>
from colossalai.amp import AMP_TYPE, convert_to_amp
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/amp/__init__.py", line 9, in <module>
from .torch_amp import convert_to_torch_amp
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/amp/torch_amp/__init__.py", line 9, in <module>
from .torch_amp import TorchAMPLoss, TorchAMPModel, TorchAMPOptimizer
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/amp/torch_amp/torch_amp.py", line 10, in <module>
from colossalai.nn.optimizer import ColossalaiOptimizer
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/__init__.py", line 1, in <module>
from ._ops import *
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/_ops/__init__.py", line 1, in <module>
from .addmm import colo_addmm
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/_ops/addmm.py", line 5, in <module>
from ._utils import GeneralTensor, Number, convert_to_colo_tensor
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/_ops/_utils.py", line 8, in <module>
from colossalai.nn.layer.utils import divide
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/layer/__init__.py", line 1, in <module>
from .colossalai_layer import *
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/layer/colossalai_layer/__init__.py", line 1, in <module>
from ._utils import partition_batch
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/layer/colossalai_layer/_utils.py", line 4, in <module>
from ..parallel_2d._operation import split_batch_2d
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/layer/parallel_2d/__init__.py", line 1, in <module>
from ._operation import reduce_by_batch_2d, split_batch_2d
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/nn/layer/parallel_2d/_operation.py", line 5, in <module>
from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/communication/__init__.py", line 1, in <module>
from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
File "/home/whong/anaconda3/envs/ColossalAI/lib/python3.9/site-packages/colossalai/communication/collective.py", line 12, in <module>
_all_gather_func = dist._all_gather_base
AttributeError: module 'torch.distributed' has no attribute '_all_gather_base'
Environment
Commands run on the server:
import torch
print(torch.__version__)
1.8.0
print(torch.version.cuda)
11.1
$ conda list
torch 1.13.1
cudatoolkit 11.1.1
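Hi, the torch version your Python interpreter is actually importing (1.8.0) is too old: it does not provide `torch.distributed._all_gather_base`, which ColossalAI needs, hence the AttributeError. Please upgrade torch, or make sure you run the script with the torch 1.13.1 build installed in your conda environment.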
Mar 16
'23 06:03
JThh