Unable to train StableLM using PEFT
I have tried to train StableLM (stabilityai/stablelm-tuned-alpha-3b) with PEFT. I have tried multiple configurations: streaming data, a single file, small batch sizes. But it always runs into a GPU out-of-memory error. I am currently using a 48 GB RTX A6000.
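One of the streaming attempts looked roughly like this (a minimal sketch; `streaming=True` is the standard `datasets` option, the rest mirrors the full script below):

```python
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-tuned-alpha-3b")
tokenizer.pad_token = tokenizer.eos_token

# Stream the corpus instead of materializing it; note this only helps
# host-side (CPU/RAM) memory, not GPU memory during training.
data = load_dataset(
    "aashay96/indic_language_corpus",
    data_files=['indic_dataset_extracted/data/as/as.txt'],
    streaming=True,
)
data = data.map(
    lambda samples: tokenizer(samples['text'], truncation=True, max_length=4096),
    batched=True,
)
```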
```python
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-tuned-alpha-3b")
tokenizer.pad_token = tokenizer.eos_token

import transformers
from datasets import load_dataset

data = load_dataset(
    "aashay96/indic_language_corpus",
    data_files=['indic_dataset_extracted/data/as/as.txt'],
)
data = data.map(
    lambda samples: tokenizer(samples['text'], truncation=True, max_length=4096),
    batched=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-tuned-alpha-3b",
    load_in_8bit=True,
    device_map='auto',
)

for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)


model.embed_out = CastOutputToFloat(model.embed_out)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# data = load_dataset("Abirate/english_quotes")
# data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        save_steps=1000,
        num_train_epochs=3,
        # max_steps=20000,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
```
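For context on where the memory goes, the allocator state can be printed right before `trainer.train()`. This is just a sketch using standard `torch.cuda` calls, not output from my runs:

```python
import torch

# Snapshot of the CUDA caching allocator just before training starts,
# to compare against the figures in the OOM message below.
print(f"allocated: {torch.cuda.memory_allocated() / 2**30:.2f} GiB")
print(f"reserved:  {torch.cuda.memory_reserved() / 2**30:.2f} GiB")
print(torch.cuda.memory_summary(abbreviated=True))
```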
Error -
```
OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB (GPU 0; 47.54 GiB total capacity; 29.90 GiB already allocated; 5.18 GiB free; 41.20 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
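The message suggests setting `max_split_size_mb` to reduce fragmentation. A minimal sketch of how that could be done (the value 128 is only an example; the variable has to be set before the first CUDA allocation, i.e. before the script above loads the model):

```python
import os

# Configure the CUDA caching allocator before torch touches the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch  # imported after setting the allocator config
```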