ms-swift icon indicating copy to clipboard operation
ms-swift copied to clipboard

不支持bf16报错

Open jfy1016 opened this issue 7 months ago • 2 comments

运行脚本就是官方给出的示例脚本:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

from swift.llm import ( get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch, get_multimodal_target_regex, LazyLLMDataset ) from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything from swift.tuners import Swift, LoraConfig from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments from functools import partial

logger = get_logger() seed_everything(42)

Hyperparameters for training

model

model_id_or_path = '/home/jdn/.cache/modelscope/hub/models/deepseek-ai/deepseek-vl2-tiny' system = None # Using the default system defined in the template. output_dir = '/home/jdn/deepseek/output'

dataset

dataset = '/home/jdn/train_CT_and_Xray_last_500.json' # dataset_id or dataset_path. Sampling 20000 data points data_seed = 42 max_length = 2048 split_dataset_ratio = 0.01 # Split validation set num_proc = 4 # The number of processes for data loading.

lora

lora_rank = 8 lora_alpha = 32 freeze_llm = False freeze_vit = True freeze_aligner = True

training_args

training_args = Seq2SeqTrainingArguments( output_dir=output_dir,

learning_rate=1e-4,
per_device_train_batch_size=1,
per_device_eval_batch_size=1,
gradient_checkpointing=True,
weight_decay=0.1,
lr_scheduler_type='cosine',
warmup_ratio=0.05,
report_to=['tensorboard'],
logging_first_step=True,
save_strategy='steps',
save_steps=50,
eval_strategy='steps',
eval_steps=50,
gradient_accumulation_steps=16,
# To observe the training results more quickly, this is set to 1 here. 
# Under normal circumstances, a larger number should be used.
num_train_epochs=1,
metric_for_best_model='loss',
save_total_limit=5,
logging_steps=5,
dataloader_num_workers=4,
data_seed=data_seed,
remove_unused_columns=False,

)

output_dir = os.path.abspath(os.path.expanduser(output_dir)) logger.info(f'output_dir: {output_dir}')

Obtain the model and template

model, processor = get_model_tokenizer(model_id_or_path) #model.half()#jdn修改 logger.info(f'model_info: {model.model_info}') template = get_template(model.model_meta.template, processor, default_system=system, max_length=max_length) template.set_mode('train') if template.use_model: template.model = model

Get target_modules and add trainable LoRA modules to the model.

target_modules = get_multimodal_target_regex(model, freeze_llm=freeze_llm, freeze_vit=freeze_vit, freeze_aligner=freeze_aligner) lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha, target_modules=target_modules) model = Swift.prepare_model(model, lora_config) logger.info(f'lora_config: {lora_config}')

Print model structure and trainable parameters.

logger.info(f'model: {model}') model_parameter_info = get_model_parameter_info(model) logger.info(f'model_parameter_info: {model_parameter_info}')

Download and load the dataset, split it into a training set and a validation set,

and encode the text data into tokens.

train_dataset, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc, seed=data_seed)

logger.info(f'train_dataset: {train_dataset}') logger.info(f'val_dataset: {val_dataset}') logger.info(f'train_dataset[0]: {train_dataset[0]}')

train_dataset = LazyLLMDataset(train_dataset, template.encode, random_state=data_seed) val_dataset = LazyLLMDataset(val_dataset, template.encode, random_state=data_seed) data = train_dataset[0] logger.info(f'encoded_train_dataset[0]: {data}')

template.print_inputs(data)

Get the trainer and start the training.

model.enable_input_require_grads() # Compatible with gradient checkpointing trainer = Seq2SeqTrainer(

model=model,
args=training_args,
data_collator=template.data_collator,
train_dataset=train_dataset,
eval_dataset=val_dataset,
template=template,

) trainer.train()

last_model_checkpoint = trainer.state.last_model_checkpoint logger.info(f'last_model_checkpoint: {last_model_checkpoint}')

Visualize the training loss.

You can also use the TensorBoard visualization interface during training by entering

tensorboard --logdir '{output_dir}/runs' at the command line.

images_dir = os.path.join(output_dir, 'images') logger.info(f'images_dir: {images_dir}') plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9) # save images

Read and display the image.

The light yellow line represents the actual loss value,

while the yellow line represents the loss value smoothed with a smoothing factor of 0.9.

from IPython.display import display from PIL import Image image = Image.open(os.path.join(images_dir, 'train_loss.png')) display(image)

报错为

[Image:报错截图]

请问如何修改代码才能将 bf16 切换为 float16?

jfy1016 avatar Apr 29 '25 05:04 jfy1016

设备不支持的问题,建议买新设备

Jintao-Huang avatar Apr 29 '25 13:04 Jintao-Huang

--torch_dtype float16

Jintao-Huang avatar May 03 '25 15:05 Jintao-Huang

--torch_dtype float16

请问,如何将该参数写入 Python 脚本中?训练环境仅支持在 Python 脚本中添加配置。

YUANMU227 avatar May 16 '25 17:05 YUANMU227

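在 get_model_tokenizer 中传入 torch_dtype 参数即可,例如(需先 import torch):model, processor = get_model_tokenizer(model_id_or_path, torch_dtype=torch.float16)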

Jintao-Huang avatar Jun 04 '25 08:06 Jintao-Huang