
RAM Usage (200GB+) While Fine-Tuning a Mistral Model with UnslothTrainer

Open risqaliyevds opened this issue 8 months ago • 1 comment

Describe the issue
I’m attempting to fine-tune the Mistral model for the Uzbek language using UnslothTrainer on a 100k-example dataset. Although each example is well under 4096 tokens, the training process quickly consumes over 200GB of system RAM and eventually stops. I’d appreciate any guidance on how to reduce memory usage or configure the training process correctly.


Environment & Observations

  • Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
  • Trainer: UnslothTrainer
  • GPU: (From screenshot) NVIDIA A100-SXM4-80GB (Driver version 535.113.01, CUDA 12.2)
  • CPU RAM usage goes beyond 200GB before training stops
  • Batch size: 96 (a rough tokens-per-step estimate follows this list)
  • Dataset size: 100,000 entries (each below 4096 tokens)
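
For scale (this arithmetic is not part of the original report): with the batch size and sequence length above, a single optimizer step can carry up to roughly 393k tokens.

per_device_train_batch_size = 96
max_seq_length = 4096
tokens_per_step = per_device_train_batch_size * max_seq_length
print(f"{tokens_per_step:,} tokens per optimizer step")  # 393,216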

Below are screenshots showing the high memory consumption (htop output, nvidia-smi, etc.).
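
The screenshots are not reproduced here. For reference, a minimal way to log the same numbers from inside the training script is sketched below; it is not part of the original report and assumes the third-party psutil package is installed.

import psutil
import torch

def log_memory(tag=""):
    vm = psutil.virtual_memory()                      # system-wide RAM statistics
    rss = psutil.Process().memory_info().rss          # RAM held by this Python process
    gpu = torch.cuda.max_memory_reserved() / 1024**3  # peak reserved GPU memory in GB
    print(f"[{tag}] system RAM used: {vm.used / 1024**3:.1f} GB | "
          f"process RSS: {rss / 1024**3:.1f} GB | GPU reserved: {gpu:.1f} GB")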


Code to Reproduce

import json
import torch
from datasets import Dataset
from unsloth import UnslothTrainer, UnslothTrainingArguments
from unsloth import FastLanguageModel, get_chat_template, is_bfloat16_supported

def load_model_and_tokenizer(model_path, max_seq_length=4096, load_in_8bit=False):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=max_seq_length,
        dtype=None,  # None for auto detection
        load_in_4bit=True,
        load_in_8bit=load_in_8bit,
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template="mistral",
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        map_eos_token=True,
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def apply_lora(model, rank=128, lora_alpha=32):
    model = FastLanguageModel.get_peft_model(
        model,
        r=rank,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
            "embed_tokens", "lm_head",  # Include these for continual pretraining
        ],
        lora_alpha=lora_alpha,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=True,
        loftq_config=None,
    )
    return model

def formatting_prompts_func(tokenizer, example):
    prompt = tokenizer.apply_chat_template(example['text'], tokenize=False)
    token_count = len(tokenizer.encode(prompt))
    if token_count > 4096 - 256:
        return {"text": ""}
    return {"text": prompt}

def main():
    model_path = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
    model, tokenizer = load_model_and_tokenizer(model_path, 4096, False)
    model = apply_lora(model)

    dataset_path = "/home/mata/llm/data/mining/training/dataset_processed_2000-2000.json"
    with open(dataset_path, "r", encoding="utf-8") as file:
        dataset = json.load(file)[:100000]

    dataset = [{"text": example} for example in dataset]
    dataset = Dataset.from_list(dataset)

    dataset = dataset.map(
        lambda example: formatting_prompts_func(tokenizer, example),
        num_proc=16,
        batched=False
    )

    dataset = dataset.filter(lambda example: example["text"] != "", num_proc=16)

    trainer = UnslothTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=4096,
        dataset_num_proc=16,
        args=UnslothTrainingArguments(
            per_device_train_batch_size=96,
            gradient_accumulation_steps=1,
            num_train_epochs=2,
            learning_rate=5e-5,
            embedding_learning_rate=1e-5,
            save_steps=500,
            save_strategy="steps",
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="/home/mata/llm/data/models/tuned/unsloth/Mistral-Nemo-Instruct-2407",
            report_to="tensorboard",
            logging_dir="/home/mata/llm/data/models/tuned/unsloth/Mistral-Nemo-Instruct-2407/logs",
        ),
    )

    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

if __name__ == "__main__":
    main()

Screenshots

[Screenshot: htop output showing system RAM usage exceeding 200 GB]
[Screenshot: nvidia-smi output for the NVIDIA A100-SXM4-80GB]


Is this normal, or do I need to do something? Thank you!

risqaliyevds avatar Mar 04 '25 09:03 risqaliyevds

Decrease per_device_train_batch_size - it's way too large!

danielhanchen avatar Mar 04 '25 12:03 danielhanchen
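
As a hedged illustration of this suggestion (the exact numbers are placeholders, not values confirmed by the maintainer): a smaller per-device batch size can be paired with gradient accumulation so the effective batch size stays at 96 while far less activation memory is held per step.

from unsloth import UnslothTrainingArguments, is_bfloat16_supported

args = UnslothTrainingArguments(
    per_device_train_batch_size=8,    # reduced from 96; tune to fit memory
    gradient_accumulation_steps=12,   # 8 * 12 = 96 samples per optimizer step
    num_train_epochs=2,
    learning_rate=5e-5,
    embedding_learning_rate=1e-5,
    optim="adamw_8bit",
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    lr_scheduler_type="linear",
    logging_steps=1,
    save_strategy="steps",
    save_steps=500,
    seed=3407,
    output_dir="outputs",             # placeholder; keep the original paths if preferred
)

All other arguments, and the UnslothTrainer call itself, stay as in the reproduction script above.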