
More problems with train_on_responses_only

Open · LostRuins opened this issue 1 year ago

I'm trying to finetune Mistral-Nemo-Base-2407 with a text dataset of long inputs. Usually, the SFTTrainer will truncate them to fit the specified context size.

However, I get an error when using train_on_responses_only.

Running the same dataset without train_on_responses_only works fine and trains normally.
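
For reference, the relevant part of my setup looks roughly like this (a trimmed sketch; model/dataset loading and the other SFTTrainer arguments are omitted):

from unsloth.chat_templates import train_on_responses_only

# Mask everything except the response part so the loss is only computed
# on the model's answers (Alpaca-style markers for a base model).
trainer = train_on_responses_only(
    trainer,
    instruction_part = "### Instruction:\n",
    response_part = "### Response:\n",
)

trainer_stats = trainer.train()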

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,068 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 258
 "-____-"     Number of trainable parameters = 57,016,320
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:762, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
    761 if not is_tensor(value):
--> 762     tensor = as_tensor(value)
    764     # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
    765     # # at-least2d
    766     # if tensor.ndim > 2:
    767     #     tensor = tensor.squeeze(0)
    768     # elif tensor.ndim < 2:
    769     #     tensor = tensor[None, :]

File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:724, in BatchEncoding.convert_to_tensors.<locals>.as_tensor(value, dtype)
    723     return torch.tensor(np.array(value))
--> 724 return torch.tensor(value)

ValueError: expected sequence of length 4096 at dim 1 (got 327)

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
Cell In[1], line 268
    258 trainer = train_on_responses_only(
    259     trainer,
    260     instruction_part = "### Instruction:\n",
    261     response_part = "### Response:\n",
    262 )
    264 # #sanity check
    265 # space = tokenizer(" ", add_special_tokens = False).input_ids[0]
    266 # tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[0]["labels"]])
--> 268 trainer_stats = trainer.train()
    270 used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    271 used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

File <string>:145, in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

File <string>:320, in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)

File /usr/local/lib/python3.10/dist-packages/accelerate/data_loader.py:550, in DataLoaderShard.__iter__(self)
    548 # We iterate one batch ahead to check when we are at the end
    549 try:
--> 550     current_batch = next(dataloader_iter)
    551 except StopIteration:
    552     yield

File /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
    627 if self._sampler_iter is None:
    628     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    629     self._reset()  # type: ignore[call-arg]
--> 630 data = self._next_data()
    631 self._num_yielded += 1
    632 if self._dataset_kind == _DatasetKind.Iterable and \
    633         self._IterableDataset_len_called is not None and \
    634         self._num_yielded > self._IterableDataset_len_called:

File /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
    672 def _next_data(self):
    673     index = self._next_index()  # may raise StopIteration
--> 674     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    675     if self._pin_memory:
    676         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     52 else:
     53     data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)

File /usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py:45, in DataCollatorMixin.__call__(self, features, return_tensors)
     43     return self.tf_call(features)
     44 elif return_tensors == "pt":
---> 45     return self.torch_call(features)
     46 elif return_tensors == "np":
     47     return self.numpy_call(features)

File /usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py:806, in DataCollatorForLanguageModeling.torch_call(self, examples)
    803 def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
    804     # Handle dict or lists with proper padding and conversion to tensor.
    805     if isinstance(examples[0], Mapping):
--> 806         batch = pad_without_fast_tokenizer_warning(
    807             self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
    808         )
    809     else:
    810         batch = {
    811             "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
    812         }

File /usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py:66, in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
     63 tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
     65 try:
---> 66     padded = tokenizer.pad(*pad_args, **pad_kwargs)
     67 finally:
     68     # Restore the state of the warning.
     69     tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state

File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:3560, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
   3557             batch_outputs[key] = []
   3558         batch_outputs[key].append(value)
-> 3560 return BatchEncoding(batch_outputs, tensor_type=return_tensors)

File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:227, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
    223     n_sequences = encoding[0].n_sequences
    225 self._n_sequences = n_sequences
--> 227 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:778, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
    773         if key == "overflowing_tokens":
    774             raise ValueError(
    775                 "Unable to create tensor returning overflowing tokens of different lengths. "
    776                 "Please see if a fast version of this tokenizer is available to have this feature available."
    777             ) from e
--> 778         raise ValueError(
    779             "Unable to create tensor, you should probably activate truncation and/or padding with"
    780             " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
    781             f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
    782             " expected)."
    783         ) from e
    785 return self

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

Any help would be appreciated.

LostRuins avatar Sep 12 '24 08:09 LostRuins

@LostRuins Apologies for the delay - it seems like it's saying the labels are nested? Would it be possible to print out maybe the first few rows of trainer.train_dataset? Thanks! Also, our Discord server can be more helpful for async help if that works!
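
Something like this would be plenty (a rough sketch):

ds = trainer.train_dataset
print(ds)
# Print the lengths of input_ids and labels for the first few rows -
# the collator error suggests some rows end up with ragged label lists.
for i in range(min(5, len(ds))):
    row = ds[i]
    print(i, len(row["input_ids"]), len(row["labels"]))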

danielhanchen avatar Sep 14 '24 08:09 danielhanchen

Hi @danielhanchen, there are many rows; I've trimmed the output to show the format.

trainer.train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1835
})

trainer.train_dataset[0]

{'input_ids': [1, 1595, 83779, 1877, 18746, ...],
 'attention_mask': [1, 1, 1, 1, 1, ...],
 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
            -100, -100, -100, -100, -100, -100, -100, -100,
            1877, 82236, 1321, 14969, 5978, ...]}

What other commands should I run?

LostRuins avatar Sep 15 '24 04:09 LostRuins

Ok I'll check on my end and get back to you asap!

danielhanchen avatar Sep 15 '24 18:09 danielhanchen

Did anyone find a fix?

ghost avatar Oct 06 '24 21:10 ghost

Sadly no.

LostRuins avatar Oct 07 '24 00:10 LostRuins

I am facing the same issue using train_on_responses_only with Qwen 2.5 7B. The solution is to use DataCollatorForSeq2Seq as the data_collator, as follows:

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = joined_dataset["train"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    # ... remaining SFTTrainer arguments unchanged
)

I found the above usage in the Llama 3.2 conversational notebook for the gradient accumulation fix. However, it looks like training with this data_collator takes more than 4x longer per training step; I assume that is due to the padding. Currently it is much faster to train without train_on_responses_only, at least for my 32k+ context use case.
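
As a quick way to confirm that the masking itself is behaving, the commented-out sanity check from the traceback above can be reused (a rough sketch):

space = tokenizer(" ", add_special_tokens = False).input_ids[0]
# Replace masked (-100) label positions with a space token and decode:
# only the response part should come back as readable text.
print(tokenizer.decode(
    [space if x == -100 else x for x in trainer.train_dataset[0]["labels"]]
))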

marcelodiaz558 avatar Nov 02 '24 21:11 marcelodiaz558

Thanks @marcelodiaz558, very helpful response! Yeah, using Seq2Seq is way too slow considering Unsloth should speed things up. I am currently at 0.01 it/s. I'll try getting rid of the data_collator argument tomorrow.

darkness8i8 avatar Nov 04 '24 08:11 darkness8i8

Any solutions?

nomadictuba2005 avatar Nov 11 '24 23:11 nomadictuba2005

It's still very wonky, especially with newlines.

LostRuins avatar Nov 12 '24 08:11 LostRuins

For fine-tuning with train_on_responses_only, I achieved faster training by enabling packing=True. I assume this is because the model processes fewer padding tokens when training samples are packed together up to the context length.
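
Roughly like this (a sketch based on my earlier snippet; the remaining arguments are unchanged):

from trl import SFTTrainer
from transformers import DataCollatorForSeq2Seq

# Sketch only: with packing enabled, shorter samples are concatenated up to
# max_seq_length, so far fewer pure-padding tokens are processed per step.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = joined_dataset["train"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = True,   # pack shorter samples together up to max_seq_length
    # ... other arguments as before
)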

marcelodiaz558 avatar Nov 15 '24 02:11 marcelodiaz558

@marcelodiaz558 packing=True causes significant quality loss due to the lack of proper attention masking in Unsloth (unrelated instructions get falsely associated with each other).

I would not recommend packing until it's fixed.

LostRuins avatar Nov 15 '24 04:11 LostRuins

I'm running into this issue with Gemma 2

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (labels in this case) have excessive nesting (inputs type list where type int is expected).

Ran train_on_responses_only with:

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<bos><start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

My dataset was already truncated to below max_seq_length.

selalipop avatar Dec 01 '24 15:12 selalipop

I had the exact same issue as selalipop above 😁, any fixes?

Jordinia avatar Dec 01 '24 19:12 Jordinia

Hello,

You should now be seeing improvements with the latest version of Unsloth. Please make sure to upgrade your installation to the latest version using:

pip install --upgrade unsloth-zoo
pip install --upgrade unsloth

Also check out our Mistral Nemo example notebook here: https://github.com/unslothai/notebooks/blob/main/nb/Mistral_Nemo_(12B)-Alpaca.ipynb

Will close this for now. Feel free to comment back with details and code that would help us reproduce the issue if you're still experiencing it after updating to the latest version. Thank you for using Unsloth.

rolandtannous avatar Jun 29 '25 22:06 rolandtannous

@rolandtannous I'm still having issues with this. Here are my unsloth and unsloth_zoo installations:

pip show unsloth unsloth_zoo
Name: unsloth
Version: 2025.7.3
Summary: 2-5X faster LLM finetuning
Home-page: http://www.unsloth.ai
Author: Unsloth AI team
Author-email: [email protected]
License-Expression: Apache-2.0
Location: /home/matt/mambaforge/envs/us/lib/python3.11/site-packages
Requires: accelerate, bitsandbytes, datasets, diffusers, hf_transfer, huggingface_hub, numpy, packaging, peft, protobuf, psutil, sentencepiece, torch, torchvision, tqdm, transformers, triton, trl, tyro, unsloth_zoo, wheel, xformers
Required-by: 
---
Name: unsloth_zoo
Version: 2025.7.4
Summary: Utils for Unsloth
Home-page: http://www.unsloth.ai
Author: Unsloth AI team
Author-email: [email protected]
License-Expression: LGPL-3.0-or-later
Location: /home/matt/mambaforge/envs/us/lib/python3.11/site-packages
Requires: accelerate, cut_cross_entropy, datasets, hf_transfer, huggingface_hub, msgspec, numpy, packaging, peft, pillow, protobuf, psutil, regex, sentencepiece, torch, tqdm, transformers, triton, trl, typing_extensions, tyro, wheel
Required-by: unsloth

Here is the code I'm using to set up my data:

# Prepare datasets
if not skip_training:
    from datasets import Dataset, load_dataset, Split
    from unsloth import to_sharegpt, standardize_sharegpt, apply_chat_template

    print("Preparing datasets...")

    # Load datasets using load_dataset for consistency
    train_dataset = load_dataset('csv', data_files=train_csv_file, )
    val_dataset = load_dataset('csv', data_files=val_csv_file, )
    test_dataset = load_dataset('csv', data_files=test_csv_file, )

    datav2 = load_dataset('csv', data_files={
        'train': train_csv_file,
        'validation': val_csv_file,
        'test': test_csv_file,
    })

    # Convert to ShareGPT format
    train_sharegpt = to_sharegpt(
        datav2['train'],
        merged_prompt=merged_prompt_template,
        output_column_name=output_col,
    )

    val_sharegpt = to_sharegpt(
        datav2['validation'],
        merged_prompt=merged_prompt_template,
        output_column_name=output_col,
    )

    test_sharegpt = to_sharegpt(
        datav2['test'],
        merged_prompt=merged_prompt_template,
        output_column_name=output_col,
    )

    # Standardize ShareGPT format
    train_standardized = standardize_sharegpt(train_sharegpt)
    val_standardized = standardize_sharegpt(val_sharegpt)
    test_standardized = standardize_sharegpt(test_sharegpt)

    print("✓ Datasets converted to ShareGPT format")
else:
    print("⏭️  Skipping dataset preparation - using existing trained model")

# Apply chat template
if not skip_training:
    train_formatted = apply_chat_template(
        train_standardized,
        tokenizer=tokenizer,
        # chat_template=chat_template,
    )

    val_formatted = apply_chat_template(
        val_standardized,
        tokenizer=tokenizer,
        # chat_template=chat_template,
    )

    test_formatted = apply_chat_template(
        test_standardized,
        tokenizer=tokenizer,
        # chat_template=chat_template,
    )

    train_formatted = train_formatted.remove_columns(
        [col for col in train_formatted.column_names if col not in ["input_ids", "attention_mask", "labels", "text"]]
    )

    val_formatted = val_formatted.remove_columns(
        [col for col in val_formatted.column_names if col not in ["input_ids", "attention_mask", "labels", "text"]]
    )

    test_formatted = test_formatted.remove_columns(
        [col for col in test_formatted.column_names if col not in ["input_ids", "attention_mask", "labels", "text"]]
    )

    print("✓ Chat template applied")
    print(f"Train dataset size: {len(train_formatted)}")
    print(f"Val dataset size: {len(val_formatted)}")
    print(f"Test dataset size: {len(test_formatted)}")

    first = train_formatted[0]
    print(f"First training sample keys: {list(first.keys())}")

    for k,v in first.items():
        if isinstance(v, str):
            print(f"{k}: {v}\n\n (length: {len(v)})")
        else:
            print(f"{k}: {v} (type: {type(v)})")
else:
    print("⏭️  Skipping chat template application - using existing trained model")



# Setup trainer
if not skip_training:
    from trl import SFTTrainer
    from transformers import TrainingArguments, DataCollatorForSeq2Seq
    from unsloth import is_bfloat16_supported

    print("Setting up trainer...")

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_formatted,
        eval_dataset=val_formatted,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, max_length=max_seq_length),
        packing=True,
        remove_unused_columns=True,
        args=TrainingArguments(
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            eval_strategy="steps",
            eval_steps=eval_steps,
            per_device_eval_batch_size=per_device_eval_batch_size,
            warmup_steps=warmup_steps,
            max_steps=num_steps,
            learning_rate=learning_rate,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=weight_decay,
            lr_scheduler_type="linear",
            seed=random_seed,
            output_dir=output_dir,
            logging_dir=log_dir,
            metric_for_best_model="eval_loss",
            save_strategy="steps",
            # save_steps=eval_steps * 2,  # Save less frequently than eval
            load_best_model_at_end=True,
            remove_unused_columns=False,
            # dataloader_num_workers=0,  # Avoid multiprocessing issues
            # report_to=None,  # Disable wandb/tensorboard for now
        ),
    )
    from unsloth.chat_templates import train_on_responses_only

    print(f'tokenizer input part: {tokenizer._unsloth_input_part}')
    print(f'tokenizer response part: {tokenizer._unsloth_output_part}')

    trainer = train_on_responses_only(
        trainer,
        # response_part="<|start_header_id|>assistant<|end_header_id|>",
        # instruction_part="<|start_header_id|>user<|end_header_id|>"
    )

    print("✓ Trainer configured")

else:
    print("⏭️  Skipping trainer setup - using existing trained model")

And I'm getting this error:

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
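
A quick way to see what the collator is actually receiving (a rough check, nothing more):

row = trainer.train_dataset[0]
# The error points at the `text` feature, so check what type and shape each
# column has after train_on_responses_only has processed the dataset.
for key, value in row.items():
    length = len(value) if hasattr(value, "__len__") else None
    print(key, type(value), length)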

Also, I appreciate the notebook link, but it doesn't use train_on_responses_only.

themantalope avatar Aug 06 '25 17:08 themantalope