MFTCoder
MFTCoder results consistently fall short in my experiments
Hi, I used three open-source QA datasets, Squad, Commonsense, and ai2_arc, for multi-task fine-tuning, but so far all three of my MFTCoder runs have performed worse than single-task fine-tuning and even worse than mixed-data fine-tuning. Using the mistral_v0.2 model, I compared four settings: fine-tuning on each dataset separately, mixed fine-tuning, no fine-tuning, and MFTCoder fine-tuning. My experimental results are below:
So I suspect that either my fine-tuning hyperparameters are off, or these three QA sub-tasks are simply not closely related enough, which would explain this behavior. I'm opening this issue to ask for your advice. Also, some of your configuration options are not documented, such as data weight; I think completing the documentation would help framework users.
Below is the configuration for my third MFTCoder fine-tuning run:
{
    "data_paths": "[data/ai2_arc,data/squad,data/commonsense_qa]",
    "output_dir": "model/mistral/single_train_2024-06-13/lora_adaptor",
    "tb_dir": "model/mistral/single_train_2024-06-13/tensorboard",
    "pretrained_model_path": "AI-ModelScope/Mistral-7B-Instruct-v0___2",
    "model_type": "mistral",
    "load_raw_dataset": true,
    "data_split": "98,2,0",
    "padding_mode": "padding",
    "use_dynamic_padding": true,
    "tokenize_mode": "sft",
    "tokenizer_type": "AutoTokenizer",
    "weighted_loss_mode": "case3",
    "attn_implementation": "flash_attention_2",
    "seq_length": 1024,
    "seed": 1234,
    "peft_type": "lora",
    "quantization": null,
    "lora_rank": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
    "learning_rate": 5e-5,
    "min_lr": 5e-6,
    "weight_decay": 0.1,
    "gradient_accumulation_steps": 2,
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 300,
    "num_train_epochs": 2,
    "log_interval": 10,
    "checkpointing_steps": 100,
    "evaluation_steps": 100,
    "max_train_steps": null,
    "epoch_checkpointing": true,
    "shuffle_before_split": true,
    "early_stopping": true,
    "early_stopping_stall_num": 5,
    "saving_limit": null,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
}
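
Since data weight is one of the undocumented options I'm unsure about, here is how I guessed it might be set, by analogy with data_paths. The key name data_weights and the bracketed-string-of-floats format are only my assumption; please correct me if the actual key or format differs:

{
    "data_paths": "[data/ai2_arc,data/squad,data/commonsense_qa]",
    "data_weights": "[1., 1., 1.]"
}

If this is roughly right, a short note on how such weights interact with weighted_loss_mode (case3 in my config) would also be very helpful, since that option seems related to per-task loss weighting as well.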