Fine-tune with LoRA
CUDA_VISIBLE_DEVICES="2,3,4,5,6,7" torchrun --nnodes=1 --nproc_per_node=6 \
    fastchat/train/train_lora.py \
    --model_name_or_path vicuna/vicuna-7b \
    --data_path vicuna/data/data.json \
    --fp16 \
    --report_to none \
    --output_dir ./checkpoints \
    --num_train_epochs 3 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 1200 \
    --save_total_limit 100 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "full_shard offload auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True
I got this error:
Traceback (most recent call last):
File "fastchat/train/train_lora.py", line 109, in
train()
File "fastchat/train/train_lora.py", line 103, in train
trainer.train()
File "/HanLP3/vicuna/transformers/src/transformers/trainer.py", line 1662, in train
return inner_training_loop(
File "/HanLP3/vicuna/transformers/src/transformers/trainer.py", line 1749, in _inner_training_loop
model = self._wrap_model(self.model_wrapped)
File "/HanLP3/vicuna/transformers/src/transformers/trainer.py", line 1489, in _wrap_model
self.model = model = FSDP(
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 1036, in init
self._auto_wrap(auto_wrap_kwargs, fsdp_kwargs)
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 1291, in _auto_wrap
_recursive_wrap(**auto_wrap_kwargs, **fsdp_kwargs)
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/wrap.py", line 403, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/wrap.py", line 403, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/wrap.py", line 403, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
[Previous line repeated 2 more times]
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/wrap.py", line 421, in _recursive_wrap
return _wrap(module, wrapper_cls, **kwargs), num_params
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/wrap.py", line 350, in _wrap
return wrapper_cls(module, **kwargs)
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 1079, in init
self._fsdp_wrapped_module = FlattenParamsWrapper(
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/flatten_params_wrapper.py", line 103, in init
self._flat_param_handle = FlatParamHandle(params, module, device, config)
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/flat_param.py", line 270, in init
self._init_flat_param(params, module)
File "/opt/conda/envs/vicuna/lib/python3.8/site-packages/torch/distributed/fsdp/flat_param.py", line 338, in _init_flat_param
raise ValueError(
ValueError: FlatParameter requires uniform requires_grad
I have the same problem. +1
It seems FSDP does not yet fully support parameter-efficient training. The author of the LoRA support suggests using DeepSpeed for now. See https://github.com/lm-sys/FastChat/pull/138#issuecomment-1495289110 for more details.
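For reference, the root cause is that LoRA freezes the base weights while the injected adapter weights stay trainable, so every LlamaDecoderLayer that FSDP auto-wraps contains a mix of requires_grad=True and requires_grad=False parameters, which FlatParameter refuses to flatten together. A minimal sketch (not FastChat code; the model path follows the command above and the q_proj/v_proj targets are illustrative) that shows the mixed flags:

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Load the base model and inject LoRA adapters; PEFT freezes the base weights.
model = AutoModelForCausalLM.from_pretrained("vicuna/vicuna-7b")
model = get_peft_model(model, LoraConfig(r=8, target_modules=["q_proj", "v_proj"]))

# Inspect one decoder layer: it now holds both frozen and trainable parameters,
# which is exactly what FSDP's FlatParameter cannot handle in a single wrap unit.
for name, module in model.named_modules():
    if type(module).__name__ == "LlamaDecoderLayer":
        flags = {p.requires_grad for p in module.parameters()}
        print(f"{name}: requires_grad values = {flags}")  # prints {True, False}
        break
```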
We won't fix this in the short term, because it is a complicated gap in FSDP's support and fixing FSDP itself is beyond our scope. We suggest using DeepSpeed or other libraries for PEFT. Closing.
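As a concrete workaround, since train_lora.py passes its arguments through to the Hugging Face Trainer, the same launch command should work once the two --fsdp flags are dropped and a DeepSpeed config is passed instead. A minimal sketch, assuming ds_config.json is a file you create yourself (the name and the ZeRO-2 / CPU-offload settings below are illustrative, not something shipped with FastChat):

```json
{
  "fp16": { "enabled": "auto" },
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": { "device": "cpu", "pin_memory": true }
  },
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto"
}
```

```bash
CUDA_VISIBLE_DEVICES="2,3,4,5,6,7" torchrun --nnodes=1 --nproc_per_node=6 \
    fastchat/train/train_lora.py \
    --deepspeed ds_config.json \
    # ...all remaining arguments as in the command above, with
    # --fsdp and --fsdp_transformer_layer_cls_to_wrap removed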