starcoder
Trying to fine-tune the starcoderbase model using finetuning.py - multiple GPUs
I am trying to fine-tune the bigcode/starcoderbase model on an A100 node with 8 GPUs (80 GB VRAM each). My first step is to adjust the parameters, and my impression is that training becomes slower when I increase the batch size from 1 to 32 (for a total of 256). Memory usage grows from 5 GB to 61 GB, so more memory is clearly being used, but the progress bar does not even appear for more than 5 hours. With batch size 1 (a total of 16) the progress bar shows up and estimates 2.5 hours for my initial fine-tuning run. In both cases the load alternates between GPUs, with only one GPU at 100% utilization at any moment. It simply goes much slower with a batch size of 32. I assumed a bigger batch size should be faster, but it feels as if increasing the batch size multiplies the training work instead of increasing the speed.
Here's my code
import os
# map all available GPUs to the visible device list
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
os.environ["WANDB_DISABLED"] = "true"
import argparse
import os
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, set_peft_model_state_dict
from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, logging, set_seed
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from pynvml import *
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
"""
Fine-Tune StarCoder on Private code
"""
class SavePeftModelCallback(TrainerCallback):
    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
        kwargs["model"].save_pretrained(checkpoint_folder)
        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        torch.save({}, pytorch_model_path)
        return control
class LoadBestPeftModelCallback(TrainerCallback):
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        print(f"Loading best peft model from {state.best_model_checkpoint} (score: {state.best_metric}).")
        best_model_path = os.path.join(state.best_model_checkpoint, "adapter_model.bin")
        adapters_weights = torch.load(best_model_path)
        model = kwargs["model"]
        set_peft_model_state_dict(model, adapters_weights)
        return control
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="bigcode/starcoderbase")
    parser.add_argument("--training_data", type=str, default="training-data/training_data_12151.json")
    parser.add_argument("--subset", type=str)
    parser.add_argument("--split", type=str)
    parser.add_argument("--size_valid_set", type=int, default=10000)
    parser.add_argument("--streaming", action="store_true")
    parser.add_argument("--shuffle_buffer", type=int, default=5000)
    # parser.add_argument("--input_column_name", type=str, default="prompt")
    # parser.add_argument("--output_column_name", type=str, default="completion")
    parser.add_argument("--seq_length", type=int, default=2048)
    parser.add_argument("--max_steps", type=int, default=10000)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=16)
    parser.add_argument("--eos_token_id", type=int, default=49152)
    parser.add_argument("--lora_r", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=32)
    parser.add_argument("--lora_dropout", type=float, default=0.05)
    parser.add_argument("--learning_rate", type=float, default=5e-6)
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--num_warmup_steps", type=int, default=100)
    parser.add_argument("--weight_decay", type=float, default=0.05)
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--no_fp16", action="store_false")
    parser.add_argument("--bf16", action="store_true", default=True)
    parser.add_argument("--no_gradient_checkpointing", action="store_false", default=False)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num_workers", type=int, default=None)
    parser.add_argument("--output_dir", type=str, default="./checkpoints")
    parser.add_argument("--log_freq", default=100, type=int)
    parser.add_argument("--eval_freq", default=100, type=int)
    parser.add_argument("--save_freq", default=1000, type=int)
    return parser.parse_args(args=[
        "--max_steps", "200",
        "--num_workers", "20",
        "--seq_length", "1024",
        "--batch_size", "1",
        "--log_freq", "1",
        "--num_warmup_steps", "100"
    ])
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))
    return total_characters / total_tokens
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
def prepare_sample_text(item):
    """Prepare the text from a sample of the dataset."""
    metadata = item['metadata']
    code_snippet = item['code_snippet']
    prompt = f"Metadata: {metadata} | Code:"
    text = f"{prompt} {code_snippet}"
    return text
class CodeConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
    Args:
        tokenizer (Tokenizer): The processor used for processing the data.
        dataset (dataset.Dataset): Dataset with text files.
        infinite (bool): If True the iterator is reset after the dataset reaches its end, else it stops.
        seq_length (int): Length of token sequences to return.
        num_of_sequences (int): Number of token sequences to keep in the buffer.
        chars_per_token (int): Number of characters per token used to estimate the number of tokens in the text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6
    ):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.concat_token_id = tokenizer.eos_token_id
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    item = next(iterator)
                    # use the item fetched above; calling next(iterator) a second time
                    # here would silently skip every other example
                    buffer.append(prepare_sample_text(item))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }
def create_datasets(tokenizer, args):
    dataset = load_dataset('json', data_files=args.training_data)
    # first shuffle dataset
    dataset['train'] = dataset['train'].shuffle(seed=42)
    # split dataset into train and test; note there is no test set, so we need to split train
    dataset = dataset['train'].train_test_split(test_size=0.1)
    train_data = dataset["train"]
    valid_data = dataset["test"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
    train_dataset = CodeConstantLengthDataset(
        tokenizer,
        train_data,
        infinite=True,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token
    )
    valid_dataset = CodeConstantLengthDataset(
        tokenizer,
        valid_data,
        infinite=False,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token
    )
    return train_dataset, valid_dataset
args = get_args()
set_seed(args.seed)
os.makedirs(args.output_dir, exist_ok=True)
logging.set_verbosity_info()
tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_auth_token=True)
train_dataset, eval_dataset = create_datasets(tokenizer, args)
train_data = train_dataset
val_data = eval_dataset
print("Loading the model")
# disable caching mechanism when using gradient checkpointing
model = AutoModelForCausalLM.from_pretrained(
    args.model_path,
    use_auth_token=True,
    use_cache=not args.no_gradient_checkpointing,
    load_in_8bit=True,
    device_map="auto"  # {"": Accelerator().process_index},
)
model = prepare_model_for_int8_training(model)
lora_config = LoraConfig(
    r=args.lora_r,
    lora_alpha=args.lora_alpha,
    lora_dropout=args.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_proj", "c_attn", "q_attn"]
)
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)
print("Starting main loop")
training_args = TrainingArguments(
    output_dir=args.output_dir,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    max_steps=args.max_steps,
    eval_steps=args.eval_freq,
    save_steps=args.save_freq,
    logging_steps=args.log_freq,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    learning_rate=args.learning_rate,
    lr_scheduler_type=args.lr_scheduler_type,
    warmup_steps=args.num_warmup_steps,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    gradient_checkpointing=not args.no_gradient_checkpointing,
    fp16=not args.no_fp16,
    bf16=args.bf16,
    weight_decay=args.weight_decay,
    run_name="StarCoderBase-finetuned",
    ddp_find_unused_parameters=False,
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_data, eval_dataset=val_data, callbacks=[SavePeftModelCallback, LoadBestPeftModelCallback])
print("Training...")
result = trainer.train()
print_summary(result)
print("Saving last checkpoint of the model")
model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/"))
Some screenshots from when I start fine-tuning with a batch size of 1:
Wondering if you have tried lowering gradient_accumulation_steps; with a larger batch size, the accumulation steps can be smaller (see the sketch below).
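For illustration only, here is a hypothetical variant of the hard-coded argument list in get_args() above; the exact values are assumptions, not a recommendation:

# Hypothetical: the same hard-coded args as in get_args() above, but with a
# larger per-device batch and fewer accumulation steps (illustrative values only).
example_args = [
    "--max_steps", "200",
    "--num_workers", "20",
    "--seq_length", "1024",
    "--batch_size", "16",                    # was "1"
    "--gradient_accumulation_steps", "1",    # default was 16
    "--log_freq", "1",
    "--num_warmup_steps", "100",
]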
This is expected: the progress bar displays the number of steps, and in your code the number of steps (max_steps) is fixed.
One step consumes number_of_gpus * batch_size * gradient_accumulation_steps samples from the dataset, so it is entirely expected that increasing batch_size (it is per device, not total) makes each step take longer. Training as a whole will still go faster: you simply need fewer steps to reach the same loss. (See the quick calculation below.)
Alternatively, you could use fixed-size (not infinite) datasets and set max_steps=-1; that way the Trainer computes the number of steps per epoch automatically, and you will see it decrease as you increase the batch size, the gradient accumulation steps, or the number of GPUs.
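As a rough back-of-the-envelope sketch (it assumes data-parallel training across the 8 GPUs from the question and the default gradient_accumulation_steps of 16):

# Samples consumed per optimizer step = number_of_gpus * batch_size * gradient_accumulation_steps.
num_gpus = 8
grad_accum = 16

for per_device_batch_size in (1, 32):
    samples_per_step = num_gpus * per_device_batch_size * grad_accum
    print(f"batch_size={per_device_batch_size}: {samples_per_step} samples per optimizer step")

# batch_size=1  ->  128 samples per step
# batch_size=32 -> 4096 samples per step: each step takes much longer, and with
# max_steps fixed the run also processes roughly 32x more data in total.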
Also, you need to run your script using accelerate or torchrun (for example torchrun --nproc_per_node=8 finetuning.py, or accelerate launch finetuning.py) to utilize multiple GPUs. Otherwise it will use just one GPU.
Hi, I'm using pretty much the same code (but with HuggingFaceH4/CodeAlpaca_20K) and run it using torchrun --nproc_per_node=8. However, I get the following error:
ValueError: DistributedDataParallel device_ids and output_device arguments only work with single-device/multiple-device GPU modules or CPU modules, but got device_ids [7], output_device 7, and module parameters {device(type='cuda', index=0), device(type='cuda', index=1), device(type='cuda', index=2), device(type='cuda', index=3), device(type='cuda', index=4), device(type='cuda', index=5), device(type='cuda', index=6)}.
Does anyone happen to know where the problem might be?
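One thing worth checking (a guess based on the code above, not a confirmed fix): device_map="auto" spreads the model's layers across all visible GPUs inside each process, while DDP expects each rank's module to live on a single device, which matches the device list in the error. The script above already hints at the alternative in a commented-out line; a minimal sketch of that variant:

# Sketch (an assumption, not a confirmed fix): when launching with torchrun,
# place the whole model on this rank's GPU instead of sharding it across all
# GPUs with device_map="auto", so DDP sees a single-device module per process.
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()
model = AutoModelForCausalLM.from_pretrained(
    "bigcode/starcoderbase",
    use_auth_token=True,
    load_in_8bit=True,
    device_map={"": accelerator.process_index},  # one GPU per rank
)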
Hey, did you solve the problem?