unsloth
unsloth copied to clipboard
More problems with train_on_responses_only
I'm trying to finetune Mistral-Nemo-Base-2407 with a text dataset of long inputs. Usually, the SFTTrainer will truncate them to fit the specified context size.
However, I get an error when using train_on_responses_only.
Running the same dataset without train_on_responses_only works fine and trains normally.
==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1
\\ /| Num examples = 2,068 | Num Epochs = 1
O^O/ \_/ \ Batch size per device = 2 | Gradient Accumulation steps = 4
\ / Total batch size = 8 | Total steps = 258
"-____-" Number of trainable parameters = 57,016,320
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:762, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
761 if not is_tensor(value):
--> 762 tensor = as_tensor(value)
764 # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
765 # # at-least2d
766 # if tensor.ndim > 2:
767 # tensor = tensor.squeeze(0)
768 # elif tensor.ndim < 2:
769 # tensor = tensor[None, :]
File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:724, in BatchEncoding.convert_to_tensors.<locals>.as_tensor(value, dtype)
723 return torch.tensor(np.array(value))
--> 724 return torch.tensor(value)
ValueError: expected sequence of length 4096 at dim 1 (got 327)
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
Cell In[1], line 268
258 trainer = train_on_responses_only(
259 trainer,
260 instruction_part = "### Instruction:\n",
261 response_part = "### Response:\n",
262 )
264 # #sanity check
265 # space = tokenizer(" ", add_special_tokens = False).input_ids[0]
266 # tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[0]["labels"]])
--> 268 trainer_stats = trainer.train()
270 used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
271 used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
File <string>:145, in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
File <string>:320, in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
File /usr/local/lib/python3.10/dist-packages/accelerate/data_loader.py:550, in DataLoaderShard.__iter__(self)
548 # We iterate one batch ahead to check when we are at the end
549 try:
--> 550 current_batch = next(dataloader_iter)
551 except StopIteration:
552 yield
File /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
672 def _next_data(self):
673 index = self._next_index() # may raise StopIteration
--> 674 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
675 if self._pin_memory:
676 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)
File /usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py:45, in DataCollatorMixin.__call__(self, features, return_tensors)
43 return self.tf_call(features)
44 elif return_tensors == "pt":
---> 45 return self.torch_call(features)
46 elif return_tensors == "np":
47 return self.numpy_call(features)
File /usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py:806, in DataCollatorForLanguageModeling.torch_call(self, examples)
803 def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
804 # Handle dict or lists with proper padding and conversion to tensor.
805 if isinstance(examples[0], Mapping):
--> 806 batch = pad_without_fast_tokenizer_warning(
807 self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
808 )
809 else:
810 batch = {
811 "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
812 }
File /usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py:66, in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
63 tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
65 try:
---> 66 padded = tokenizer.pad(*pad_args, **pad_kwargs)
67 finally:
68 # Restore the state of the warning.
69 tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state
File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:3560, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3557 batch_outputs[key] = []
3558 batch_outputs[key].append(value)
-> 3560 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:227, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
223 n_sequences = encoding[0].n_sequences
225 self._n_sequences = n_sequences
--> 227 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
File /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:778, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
773 if key == "overflowing_tokens":
774 raise ValueError(
775 "Unable to create tensor returning overflowing tokens of different lengths. "
776 "Please see if a fast version of this tokenizer is available to have this feature available."
777 ) from e
--> 778 raise ValueError(
779 "Unable to create tensor, you should probably activate truncation and/or padding with"
780 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
781 f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
782 " expected)."
783 ) from e
785 return self
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Any help would be appreciated.
@LostRuins Apologies on the delay - it seems like it's saying the labels are nested? Would it be possible to print out maybe the first few rows of trainer.train_dataset? Thanks! Also our Discord server can be more helpful for async help if that works!
Hi @danielhanchen, there are many rows, so I've trimmed the output to show the format:
trainer.train_dataset
Dataset({
features: ['input_ids', 'attention_mask', 'labels'],
num_rows: 1835
})
trainer.train_dataset[0]
{'input_ids': [1,
1595,
83779,
1877,
18746,
...],
'attention_mask': [1,
1,
1,
1,
1,
...],
'labels': [-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
-100,
1877,
82236,
1321,
14969,
5978,
...]}
What other commands should I run?
Ok I'll check on my end and get back to you asap!
did anyone find a fix?
Sadly no.
I am facing the same issue using train_on_responses_only with Qwen 2.5 7B, and the solution is using the DataCollatorForSeq2Seq as the data_collator as follows:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model = model,
tokenizer = tokenizer,
train_dataset = joined_dataset["train"],
dataset_text_field = "text",
max_seq_length = max_seq_length,
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
...
I found the above usage in the Llama 3.2 conversational notebook for the gradient accumulation fix. However, it looks like training with this data_collator takes more than 4X longer per training step; I assume this is due to the padding. Currently it is way faster to train without train_on_responses_only, at least for my 32k+ context use case.
Thanks @marcelodiaz558, very helpful response! Yeah, using Seq2Seq is way too slow considering Unsloth should speed things up. I am currently at 0.01 it/s. I'll try getting rid of the data_collator argument tomorrow.
any solutions?
It's still very wonky, especially with newlines
For fine-tuning models using train_on_responses_only, I achieved faster training results by enabling packing=True. I assume this is because the model processes fewer padding tokens when putting training samples together to reach the context length.
@marcelodiaz558 Packing = true causes significant quality loss due to the lack of proper attention masking in unsloth. (Unrelated instructions get falsely associated together)
I would not recommend packing until it's fixed.
I'm running into this issue with Gemma 2
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Ran train_on_responses_only with:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part = "<bos><start_of_turn>user\n",
response_part = "<start_of_turn>model\n",
)
My dataset was already truncated to lower than max_seq_length
I'm running into this issue with Gemma 2
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Ran train_on_responses_only with:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part = "<bos><start_of_turn>user\n",
response_part = "<start_of_turn>model\n",
)
My dataset was already truncated to lower than max_seq_length
i had the exact same issue😁, any fixes?
Hello ,
You should now be seeing improvements with the latest version of unsloth. Please make sure to upgrade your installation to the latest version using
pip install --upgrade unsloth-zoo
pip install --upgrade unsloth
Also check out our Mistral Nemo example notebook here: https://github.com/unslothai/notebooks/blob/main/nb/Mistral_Nemo_(12B)-Alpaca.ipynb
I will close this for now. If you're still experiencing the issue after updating to the latest version, feel free to comment back with details and code that would help us reproduce it. Thank you for using unsloth
@rolandtannous I'm still having issues with this. Here are my unsloth and unsloth_zoo installations:
pip show unsloth unsloth_zoo
Name: unsloth
Version: 2025.7.3
Summary: 2-5X faster LLM finetuning
Home-page: http://www.unsloth.ai
Author: Unsloth AI team
Author-email: [email protected]
License-Expression: Apache-2.0
Location: /home/matt/mambaforge/envs/us/lib/python3.11/site-packages
Requires: accelerate, bitsandbytes, datasets, diffusers, hf_transfer, huggingface_hub, numpy, packaging, peft, protobuf, psutil, sentencepiece, torch, torchvision, tqdm, transformers, triton, trl, tyro, unsloth_zoo, wheel, xformers
Required-by:
---
Name: unsloth_zoo
Version: 2025.7.4
Summary: Utils for Unsloth
Home-page: http://www.unsloth.ai
Author: Unsloth AI team
Author-email: [email protected]
License-Expression: LGPL-3.0-or-later
Location: /home/matt/mambaforge/envs/us/lib/python3.11/site-packages
Requires: accelerate, cut_cross_entropy, datasets, hf_transfer, huggingface_hub, msgspec, numpy, packaging, peft, pillow, protobuf, psutil, regex, sentencepiece, torch, tqdm, transformers, triton, trl, typing_extensions, tyro, wheel
Required-by: unsloth
Here is the code I'm using to setup my data:
# Prepare datasets
if not skip_training:
from datasets import Dataset, load_dataset, Split
from unsloth import to_sharegpt, standardize_sharegpt, apply_chat_template
print("Preparing datasets...")
# Load datasets using load_dataset for consistency
train_dataset = load_dataset('csv', data_files=train_csv_file, )
val_dataset = load_dataset('csv', data_files=val_csv_file, )
test_dataset = load_dataset('csv', data_files=test_csv_file, )
datav2 = load_dataset('csv', data_files={
'train': train_csv_file,
'validation': val_csv_file,
'test': test_csv_file,
})
# Convert to ShareGPT format
train_sharegpt = to_sharegpt(
datav2['train'],
merged_prompt=merged_prompt_template,
output_column_name=output_col,
)
val_sharegpt = to_sharegpt(
datav2['validation'],
merged_prompt=merged_prompt_template,
output_column_name=output_col,
)
test_sharegpt = to_sharegpt(
datav2['test'],
merged_prompt=merged_prompt_template,
output_column_name=output_col,
)
# Standardize ShareGPT format
train_standardized = standardize_sharegpt(train_sharegpt)
val_standardized = standardize_sharegpt(val_sharegpt)
test_standardized = standardize_sharegpt(test_sharegpt)
print("✓ Datasets converted to ShareGPT format")
else:
print("⏭️ Skipping dataset preparation - using existing trained model")
# Apply chat template
if not skip_training:
train_formatted = apply_chat_template(
train_standardized,
tokenizer=tokenizer,
# chat_template=chat_template,
)
val_formatted = apply_chat_template(
val_standardized,
tokenizer=tokenizer,
# chat_template=chat_template,
)
test_formatted = apply_chat_template(
test_standardized,
tokenizer=tokenizer,
# chat_template=chat_template,
)
train_formatted = train_formatted.remove_columns(
[col for col in train_formatted.column_names if col not in ["input_ids", "attention_mask", "labels", "text"]]
)
val_formatted = val_formatted.remove_columns(
[col for col in val_formatted.column_names if col not in ["input_ids", "attention_mask", "labels", "text"]]
)
test_formatted = test_formatted.remove_columns(
[col for col in test_formatted.column_names if col not in ["input_ids", "attention_mask", "labels", "text"]]
)
print("✓ Chat template applied")
print(f"Train dataset size: {len(train_formatted)}")
print(f"Val dataset size: {len(val_formatted)}")
print(f"Test dataset size: {len(test_formatted)}")
first = train_formatted[0]
print(f"First training sample keys: {list(first.keys())}")
for k,v in first.items():
if isinstance(v, str):
print(f"{k}: {v}\n\n (length: {len(v)})")
else:
print(f"{k}: {v} (type: {type(v)})")
else:
print("⏭️ Skipping chat template application - using existing trained model")
# Setup trainer
if not skip_training:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
print("Setting up trainer...")
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=train_formatted,
eval_dataset=val_formatted,
dataset_text_field="text",
max_seq_length=max_seq_length,
dataset_num_proc=2,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, max_length=max_seq_length),
packing=True,
remove_unused_columns=True,
args=TrainingArguments(
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
eval_strategy="steps",
eval_steps=eval_steps,
per_device_eval_batch_size=per_device_eval_batch_size,
warmup_steps=warmup_steps,
max_steps=num_steps,
learning_rate=learning_rate,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
optim="adamw_8bit",
weight_decay=weight_decay,
lr_scheduler_type="linear",
seed=random_seed,
output_dir=output_dir,
logging_dir=log_dir,
metric_for_best_model="eval_loss",
save_strategy="steps",
# save_steps=eval_steps * 2, # Save less frequently than eval
load_best_model_at_end=True,
remove_unused_columns=False,
# dataloader_num_workers=0, # Avoid multiprocessing issues
# report_to=None, # Disable wandb/tensorboard for now
),
)
from unsloth.chat_templates import train_on_responses_only
print(f'tokenizer input part: {tokenizer._unsloth_input_part}')
print(f'tokenizer response part: {tokenizer._unsloth_output_part}')
trainer = train_on_responses_only(
trainer,
# response_part="<|start_header_id|>assistant<|end_header_id|>",
# instruction_part="<|start_header_id|>user<|end_header_id|>"
)
print("✓ Trainer configured")
else:
print("⏭️ Skipping trainer setup - using existing trained model")
And I'm getting this error:
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Also, I appreciate the notebook link, but it doesn't use train_on_responses_only.