Question
(ll) root@platform:/workspace/dhl/LLaVA# bash /workspace/dhl/LLaVA/scripts/v1_5/finetune.sh
[2023-12-06 16:23:00,608] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-12-06 16:23:07,002] [WARNING] [runner.py:203:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2023-12-06 16:23:07,004] [INFO] [runner.py:570:main] cmd = /usr/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None llava/train/train_mem.py --deepspeed ./scripts/zero3.json --model_name_or_path /workspace/dhl/model/llava-v1.5-13b --version v1 --data_path /workspace/autoglm/dataset/img/mind2web/subtree/train/all.json --image_folder /workspace/autoglm/dataset/img/img_mark --vision_tower openai/clip-vit-large-patch14-336 --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin --mm_projector_type mlp2x_gelu --mm_vision_select_layer -2 --mm_use_im_start_end False --mm_use_im_patch_token False --image_aspect_ratio pad --group_by_modality_length True --bf16 True --output_dir ./checkpoints/llava-v1.5-13b --num_train_epochs 1 --per_device_train_batch_size 16 --per_device_eval_batch_size 4 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 50000 --save_total_limit 1 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type cosine --logging_steps 1 --tf32 True --model_max_length 2048 --gradient_checkpointing True --dataloader_num_workers 4 --lazy_preprocess True --report_to wandb
[2023-12-06 16:23:08,719] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-12-06 16:23:14,892] [INFO] [launch.py:138:main] 0 NCCL_IB_PCI_RELAXED_ORDERING=1
[2023-12-06 16:23:14,892] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.19.3
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_SOCKET_IFNAME=eth0
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_NVLS_ENABLE=1
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_DEBUG=VERSION
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_IB_GID_INDEX=97
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=23
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_IB_DISABLE=0
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_IB_RETRY_CNT=7
[2023-12-06 16:23:14,893] [INFO] [launch.py:138:main] 0 NCCL_CROSS_NIC=0
[2023-12-06 16:23:14,893] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
[2023-12-06 16:23:14,893] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=8, node_rank=0
[2023-12-06 16:23:14,893] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
[2023-12-06 16:23:14,893] [INFO] [launch.py:163:main] dist_world_size=8
[2023-12-06 16:23:14,893] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/import_utils.py", line 1099, in _get_module
return importlib.import_module("." + module_name, self.__name__)
File "/usr/lib/python3.10/importlib/__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 32, in <module>
from ...modeling_utils import PreTrainedModel
File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 38, in
from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
File "/usr/local/lib/python3.10/dist-packages/transformers/deepspeed.py", line 37, in
from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
File "/usr/local/lib/python3.10/dist-packages/accelerate/init.py", line 3, in
from .accelerator import Accelerator
File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 35, in
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
File "/usr/local/lib/python3.10/dist-packages/accelerate/checkpointing.py", line 24, in
from .utils import (
File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/init.py", line 133, in
from .launch import (
File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/launch.py", line 33, in
from ..utils.other import is_port_in_use, merge_dicts
File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/other.py", line 25, in
from .transformer_engine import convert_model
File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/transformer_engine.py", line 21, in
import transformer_engine.pytorch as te
File "/usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/init.py", line 6, in
from .module import LayerNormLinear
File "/usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/module/init.py", line 6, in
from .layernorm_linear import LayerNormLinear
File "/usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/module/layernorm_linear.py", line 15, in
from .. import cpp_extensions as tex
File "/usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/cpp_extensions/init.py", line 6, in
from transformer_engine_extensions import *
ImportError: /usr/local/lib/python3.10/dist-packages/transformer_engine_extensions.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c10ltERKNS_6SymIntEi
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/workspace/dhl/LLaVA/llava/train/train_mem.py", line 6, in
from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
File "/workspace/dhl/LLaVA/llava/init.py", line 1, in
from .model import LlavaLlamaForCausalLM
File "/workspace/dhl/LLaVA/llava/model/init.py", line 1, in
from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
File "/workspace/dhl/LLaVA/llava/model/language_model/llava_llama.py", line 21, in
from transformers import AutoConfig, AutoModelForCausalLM,
File "", line 1075, in _handle_fromlist
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/import_utils.py", line 1090, in getattr
value = getattr(module, name)
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/import_utils.py", line 1089, in getattr
module = self._get_module(self._class_to_module[name])
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/import_utils.py", line 1101, in _get_module
raise RuntimeError(
RuntimeError: Failed to import transformers.models.llama.modeling_llama because of the following error (look up to see its traceback):
/usr/local/lib/python3.10/dist-packages/transformer_engine_extensions.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c10ltERKNS_6SymIntEi
[The identical ImportError traceback is printed again (presumably by another of the eight launched ranks); the duplicated output is omitted.]
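For reference, the undefined symbol _ZN3c10ltERKNS_6SymIntEi demangles to c10::operator<(c10::SymInt const&, int), i.e. a PyTorch (c10) symbol that the prebuilt transformer_engine_extensions binary expects to find in the installed torch libraries. The following is only a minimal diagnostic sketch, not a fix, and it makes two assumptions: it runs in the same interpreter as the log above, and the installed PyTorch wheel ships libc10.so under torch/lib (the usual layout for pip wheels), which is where this operator would normally be exported. It prints the torch version, checks whether libc10.so exports the symbol, and then attempts the same transformer_engine import that accelerate triggers in the traceback.

```python
# Minimal diagnostic sketch (assumptions: same interpreter as the log above,
# and a pip-style PyTorch wheel that ships libc10.so under torch/lib).
import ctypes
import os

import torch

print("torch version:", torch.__version__)

# The failing extension needs c10::operator<(c10::SymInt const&, int),
# mangled as _ZN3c10ltERKNS_6SymIntEi. Check whether the installed
# libc10.so actually exports that symbol.
libc10_path = os.path.join(os.path.dirname(torch.__file__), "lib", "libc10.so")
libc10 = ctypes.CDLL(libc10_path)
try:
    getattr(libc10, "_ZN3c10ltERKNS_6SymIntEi")
    print("symbol found in", libc10_path)
except AttributeError:
    print("symbol NOT found in", libc10_path,
          "-> transformer_engine was built against a different torch")

# This is the import that accelerate.utils.other triggers in the traceback
# above; running it in isolation should reproduce the same ImportError.
try:
    import transformer_engine.pytorch  # noqa: F401
    print("transformer_engine.pytorch imported cleanly")
except ImportError as exc:
    print("transformer_engine.pytorch failed to import:", exc)
```

If the symbol check reports the symbol as missing, the prebuilt transformer_engine_extensions binary and the installed torch are ABI-incompatible, which is consistent with the undefined-symbol ImportError in the log above.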