PaddleMIX
PaddleMIX copied to clipboard
在NPU上进行LLaVA1.6微调训练时遇到问题
Paddle版本:
python -c "import paddle; print(paddle.version.commit)"
CustomDevice: npu, visible devices count: 2
2ef8abae65f11fa3cdae784b4ac58750e0fa3bbb
CANN版本:8.0.RC1
操作系统版本:Ubuntu 20.04.3 LTS
lora_sft_argument.json
为:
{
"model_name_or_path": "paddlemix/llava/llava-v1.6-vicuna-7b",
"dataset": {
"train":[{"name": "chatml_dataset", "data_files": "data/train.json","chat_template":"data/llava_chat_template.json"}],
"eval":[{"name": "chatml_dataset", "data_files": "data/train.json","chat_template":"data/llava_chat_template.json"}]
},
"mixtoken": false,
"output_dir": "./checkpoints/llava_sft_ckpts",
"device": "npu",
"overwrite_output_dir": true,
"recompute": true,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 1,
"eval_accumulation_steps":16,
"num_train_epochs": 1,
"learning_rate": 2e-04,
"mm_projector_lr": 2e-5,
"weight_decay": 0.0,
"warmup_ratio": 0.03,
"lr_scheduler_type": "cosine",
"logging_steps": 1,
"save_steps": 1000,
"evaluation_strategy": "no",
"save_strategy": "steps",
"max_length": 256,
"bf16": false,
"fp16": true,
"fp16_opt_level": "O1",
"do_train": true,
"do_eval": false,
"disable_tqdm": false,
"load_best_model_at_end": false,
"eval_with_do_generation": false,
"skip_memory_metrics": false,
"benchmark": false,
"save_total_limit": 1,
"lora": true,
"lora_rank": 128,
"lora_alpha": 256,
"lora_dropout": 0.0,
"lora_target_modules":["llama.layer.*q_proj.*",
"llama.layer.*k_proj.*",
"llama.layer.*v_proj.*",
"llama.layer.*gate_proj.*",
"llama.layer.*up_proj.*",
"llama.layer.*down_proj.*",
"llama.layer.*o_proj.*"]
}
- 并行训练时遇到的问题,训练脚本如下:
python -u -m paddle.distributed.launch --npus "0,1" paddlemix/tools/supervised_finetune.py paddlemix/config/llava/v1_6/lora_sft_argument.json
报错信息如下:
Call HcclAllReduce(send_buf, recv_buf, count, PDDataTypeToHcclDataType(data_type), PDReduceOpToHcclReduceOp(op), reinterpret_cast<HcclComm>(comm), reinterpret_cast<aclrtStream>(stream)) failed : 5 at file /root/PaddleCustomDevice/backends/npu/runtime/runtime.cc line 881
E40024: 2024-07-26-11:47:38.607.914 Failed call Python Func/Meathod [get_binfile_sha256_hash_from_c], Reason[SystemError: PY_SSIZE_T_CLEAN macro must be defined for '#' formats
]
Possible Cause: The Python Func/Meathod does not exist.
LAUNCH INFO 2024-07-26 11:57:05,580 Exit code -11
- 单卡训练时遇到的问题:
训练代码会在中途卡住不动,不清楚是什么原因造成的。