
ChatGLM-6B LoRA fine-tuning fails with "iteration over a 0-d tensor" after reaching the specified eval_steps

LivinLuo1993 opened this issue 1 year ago · 1 comment

ChatGLM-6B LoRA fine-tuning raises "iteration over a 0-d tensor" once training reaches the specified eval_steps. The failure is shown below: (screenshot not reproduced)
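For context, PyTorch raises this exact TypeError whenever code tries to iterate over a scalar (0-dimensional) tensor, which usually means some loop received a bare loss value where it expected a sequence of outputs. A minimal repro:

```python
import torch

loss = torch.tensor(1.5)  # a 0-d (scalar) tensor, e.g. a bare loss value
for value in loss:        # TypeError: iteration over a 0-d tensor
    print(value)
```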

The code is as follows:

```python
# Imports used below; args, config, device_map, max_train_steps, data_collator,
# CastOutputToFloat, and ModifiedTrainer are defined elsewhere in the script.
import os
import sys

import torch
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoModel, TrainingArguments
from transformers.integrations import TensorBoardCallback
from peft import LoraConfig, TaskType, get_peft_model, get_peft_model_state_dict
from datasets import load_from_disk


def train_v2(model, train_data, val_data):
    writer = SummaryWriter()

    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1

    train_args = TrainingArguments(
        output_dir=args.output_path,
        do_train=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=1,
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        warmup_ratio=0.05,
        max_steps=max_train_steps,
        fp16=True,
        logging_steps=100,
        eval_steps=100,
        save_steps=100,
        evaluation_strategy="steps" if args.test_size > 0 else "no",
        save_strategy="steps",
        load_best_model_at_end=True,
        remove_unused_columns=False,
        ddp_find_unused_parameters=False if ddp else None,
        ignore_data_skip=False,
        seed=10,
        data_seed=10,
        group_by_length=False,
        # deepspeed="./config/ds_config.json"
    )

    trainer = ModifiedTrainer(
        model=model,
        # optimizers=(optimizer, lr_scheduler),
        train_dataset=train_data,
        eval_dataset=val_data,
        args=train_args,
        callbacks=[TensorBoardCallback(writer)],
        # data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        data_collator=data_collator,
    )

    # Patch state_dict so checkpoints contain only the LoRA weights.
    old_state_dict = model.state_dict
    model.state_dict = (
        lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
    ).__get__(model, type(model))

    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    print("\n If there's a warning about missing keys above, please disregard :)")

    # trainer.train(resume_from_checkpoint=args.resume_from_checkpoint)
    trainer.train()
    writer.close()

    model.save_pretrained(args.output_path)


model = AutoModel.from_pretrained(
    args.model_name_or_path,
    config=config,
    # load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map=device_map,
    trust_remote_code=True,
    revision="",
)

model = model.half()

# Enable gradient checkpointing and disable the KV cache for training.
model.supports_gradient_checkpointing = True
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.config.use_cache = False

# Cast the LM head output to fp32 and wrap the model with LoRA adapters.
model.lm_head = CastOutputToFloat(model.lm_head)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

train_data = load_from_disk(args.data_path)
train_v2(model, train_data, None)
```
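One thing worth noting in the snippet, independent of the fix below: train_v2 is called with val_data=None, yet evaluation_strategy becomes "steps" whenever args.test_size > 0, so the trainer may enter an evaluation loop without an eval_dataset. A minimal sketch of splitting off a validation set, assuming train_data is a single datasets.Dataset (the split call and arguments are illustrative):

```python
# Hypothetical split so eval_dataset is not None when evaluation is enabled.
split = train_data.train_test_split(test_size=args.test_size, seed=10)
train_v2(model, split["train"], split["test"])
```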

LivinLuo1993 · Aug 18 '23 10:08

Modify the ModifiedTrainer part as follows:

```python
class ModifiedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Forward only input_ids and labels; the model computes the loss itself.
        outputs = model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        )
        loss = outputs.loss
        # Hand back the full model output (not just the scalar loss)
        # when the caller asks for outputs, e.g. during evaluation.
        return (loss, outputs) if return_outputs else loss

    def save_model(self, output_dir=None, _internal_call=False):
        # Save only the PEFT/LoRA adapter weights.
        self.model.save_pretrained(output_dir)
```
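This presumably helps because the Trainer's evaluation loop indexes and gathers the model outputs; if compute_loss hands back only a bare 0-d loss tensor, that iteration fails with the error above, whereas returning the full model output gives the loop indexable logits. A quick one-batch smoke test of the patched compute_loss, assuming a single-device setup (the manual device move and the assertions are illustrative, not part of the original code):

```python
# Hypothetical sanity check: run one collated batch through compute_loss.
batch = next(iter(trainer.get_train_dataloader()))
batch = {k: v.to(model.device) for k, v in batch.items()}
loss, outputs = trainer.compute_loss(model, batch, return_outputs=True)
assert loss.dim() == 0             # scalar training loss
assert hasattr(outputs, "logits")  # structured output for the eval loop
```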

beyondguo · Oct 01 '23 05:10