
Trying to fine-tune the starcoderbase model using finetuning.py - multiple GPUs

Open dimichgh opened this issue 1 year ago • 5 comments

I am trying to fine-tune the bigcode/starcoderbase model on an A100 compute node with 8 GPUs (80 GB VRAM each). As a first step I am adjusting the parameters. Training seems to slow down when I increase the per-device batch size from 1 to 32 (an effective total of 256). Memory usage grows from 5 GB to 61 GB, so more memory is clearly being used, but the progress bar does not appear for more than 5 hours, whereas with batch size 1 (effective total of 16) the progress bar comes up quickly and estimates 2.5 hours for my initial fine-tuning run. In both cases the load alternates between GPUs, with 100% utilization on one device at a time. It is just far slower with a batch size of 32. I assumed a bigger batch size would make training faster, but instead increasing it feels like it multiplies the training steps rather than increasing the speed.

Here's my code

import os  
# map all available GPUs to the visible device list
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
os.environ["WANDB_DISABLED"] = "true"

import argparse
import os

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, set_peft_model_state_dict
from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, logging, set_seed
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

"""
Fine-Tune StarCoder on Private code
"""

class SavePeftModelCallback(TrainerCallback):
    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        kwargs["model"].save_pretrained(checkpoint_folder)

        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
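        # Write an empty state dict as pytorch_model.bin so the checkpoint keeps
        # only the PEFT adapter weights instead of a copy of the full base model.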
        torch.save({}, pytorch_model_path)
        return control


class LoadBestPeftModelCallback(TrainerCallback):
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        print(f"Loading best peft model from {state.best_model_checkpoint} (score: {state.best_metric}).")
        best_model_path = os.path.join(state.best_model_checkpoint, "adapter_model.bin")
        adapters_weights = torch.load(best_model_path)
        model = kwargs["model"]
        set_peft_model_state_dict(model, adapters_weights)
        return control
    

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="bigcode/starcoderbase")
    parser.add_argument("--training_data", type=str, default="training-data/training_data_12151.json")
    parser.add_argument("--subset", type=str)
    parser.add_argument("--split", type=str)
    parser.add_argument("--size_valid_set", type=int, default=10000)
    parser.add_argument("--streaming", action="store_true")
    parser.add_argument("--shuffle_buffer", type=int, default=5000)

    # parser.add_argument("--input_column_name", type=str, default="prompt")
    # parser.add_argument("--output_column_name", type=str, default="completion")

    parser.add_argument("--seq_length", type=int, default=2048)
    parser.add_argument("--max_steps", type=int, default=10000)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=16)
    parser.add_argument("--eos_token_id", type=int, default=49152)

    parser.add_argument("--lora_r", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=32)
    parser.add_argument("--lora_dropout", type=float, default=0.05)

    parser.add_argument("--learning_rate", type=float, default=5e-6)
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--num_warmup_steps", type=int, default=100)
    parser.add_argument("--weight_decay", type=float, default=0.05)

    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--no_fp16", action="store_false")
    parser.add_argument("--bf16", action="store_true", default=True)
    parser.add_argument("--no_gradient_checkpointing", action="store_false", default=False)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num_workers", type=int, default=None)
    parser.add_argument("--output_dir", type=str, default="./checkpoints")
    parser.add_argument("--log_freq", default=100, type=int)
    parser.add_argument("--eval_freq", default=100, type=int)
    parser.add_argument("--save_freq", default=1000, type=int)

    return parser.parse_args(args=[
        "--max_steps", "200",
        "--num_workers", "20",
        "--seq_length", "1024",
        "--batch_size", "1",
        "--log_freq", "1",
        "--num_warmup_steps", "100"     
    ])

def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def prepare_sample_text(item):
    """Prepare the text from a sample of the dataset."""
    metadata = item['metadata']  
    code_snippet = item['code_snippet'] 

    prompt = f"Metadata: {metadata} | Code:"  
    text = f"{prompt} {code_snippet}"
    return text

class CodeConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for processing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True, the iterator is reset when the dataset is exhausted; otherwise it stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in the buffer.
            chars_per_token (float): Number of characters per token, used to estimate the number of tokens in the text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6
    ):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.concat_token_id = tokenizer.eos_token_id
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    item = next(iterator)
                    buffer.append(prepare_sample_text(item))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }

def create_datasets(tokenizer, args):
    dataset = load_dataset('json', data_files=args.training_data)
    # first shuffle dataset
    dataset['train'] = dataset['train'].shuffle(seed=42)
    # split dataset into train and test, note there is no test set, so we need to split train
    dataset = dataset['train'].train_test_split(test_size=0.1)

    train_data = dataset["train"]
    valid_data = dataset["test"]

    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = CodeConstantLengthDataset(
        tokenizer,
        train_data,
        infinite=True,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token
    )
    valid_dataset = CodeConstantLengthDataset(
        tokenizer,
        valid_data,
        infinite=False,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token
    )
    return train_dataset, valid_dataset

args = get_args()
set_seed(args.seed)
os.makedirs(args.output_dir, exist_ok=True)
logging.set_verbosity_info()
tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_auth_token=True)
train_dataset, eval_dataset = create_datasets(tokenizer, args)
train_data = train_dataset
val_data = eval_dataset

print("Loading the model")
# disable caching mechanism when using gradient checkpointing
model = AutoModelForCausalLM.from_pretrained(
    args.model_path,
    use_auth_token=True,
    use_cache=not args.no_gradient_checkpointing,
    load_in_8bit=True,
    device_map="auto"#{"": Accelerator().process_index},
)
model = prepare_model_for_int8_training(model)

lora_config = LoraConfig(
    r=args.lora_r,
    lora_alpha=args.lora_alpha,
    lora_dropout=args.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = ["c_proj", "c_attn", "q_attn"]
)

model = get_peft_model(model, lora_config)

print_trainable_parameters(model)

print("Starting main loop")

training_args = TrainingArguments(
    output_dir=args.output_dir,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    max_steps=args.max_steps,
    eval_steps=args.eval_freq,
    save_steps=args.save_freq,
    logging_steps=args.log_freq,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    learning_rate=args.learning_rate,
    lr_scheduler_type=args.lr_scheduler_type,
    warmup_steps=args.num_warmup_steps,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    gradient_checkpointing=not args.no_gradient_checkpointing,
    fp16=not args.no_fp16,
    bf16=args.bf16,
    weight_decay=args.weight_decay,
    run_name="StarCoderBase-finetuned",
    ddp_find_unused_parameters=False,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_data, eval_dataset=val_data, callbacks=[SavePeftModelCallback, LoadBestPeftModelCallback])

print("Training...")
result = trainer.train()
print_summary(result)

print("Saving last checkpoint of the model")
model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/"))

Some screenshots from when I start fine-tuning with a batch size of 1:

[Screenshots from 2023-05-26: 10:34:09 AM, 10:34:18 AM, 10:34:38 AM, 10:25:01 AM]

dimichgh avatar May 26 '23 19:05 dimichgh

Wondering if you have tried lowering gradient_accumulation_steps; with a larger batch size, the accumulation steps can be smaller.
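
For example, with the hard-coded override list in get_args() above, that could look something like this (illustrative values only, not a tested recommendation):

# Hypothetical alternative to the hard-coded args above: a larger per-device
# batch with a correspondingly smaller accumulation count (illustrative only).
args_override = [
    "--max_steps", "200",
    "--num_workers", "20",
    "--seq_length", "1024",
    "--batch_size", "32",
    "--gradient_accumulation_steps", "2",
    "--log_freq", "1",
    "--num_warmup_steps", "100",
]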

ywen666 avatar May 29 '23 18:05 ywen666

This is expected, because the progress bar displays the number of steps, and in your code the number of steps is fixed (max_steps).

One step consumes number_of_gpus * batch_size * gradient_accumulation_steps samples from the dataset, so it is entirely expected that increasing batch_size (it is per device, not total) makes each step take longer. The training still progresses faster overall; you just need fewer steps to reach the same loss.

Alternatively, you could use fixed-size (not infinite) datasets and set max_steps=-1; that way the trainer will compute the number of steps per epoch automatically, and you will see that it decreases as the batch size, gradient accumulation steps, or number of GPUs increases.
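
For intuition, here is a minimal arithmetic sketch of both points. The GPU count, batch sizes, and dataset size are assumptions for illustration (the dataset size is guessed from the training_data_12151.json filename in the script), not measured values from your run.

def samples_per_step(num_gpus, per_device_batch_size, gradient_accumulation_steps):
    # Samples consumed by one optimizer step (what each progress-bar tick represents)
    return num_gpus * per_device_batch_size * gradient_accumulation_steps

dataset_size = 12151  # assumption: inferred from the training data file name above

for gpus, bs, accum in [(8, 1, 16), (8, 32, 16)]:
    per_step = samples_per_step(gpus, bs, accum)
    print(f"batch_size={bs}: {per_step} samples per step, "
          f"~{dataset_size // per_step} steps for one pass over the data")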

xpl avatar May 30 '23 18:05 xpl

Also, you need to run your script using accelerate or torchrun to utilize multiple GPUs. Otherwise it would use just one GPU.
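
When you launch one process per GPU that way (e.g. torchrun --nproc_per_node=8), each process should also load its own full copy of the model on its own device rather than sharding a single copy across all GPUs with device_map="auto". A minimal sketch of that loading step, based on the commented-out device_map alternative in the script above (an assumption about the intended setup, not a tested configuration):

from accelerate import Accelerator
from transformers import AutoModelForCausalLM

# Pin one full 8-bit copy of the model to this process's GPU, which is what DDP
# expects; device_map="auto" would instead spread a single copy over all GPUs.
model = AutoModelForCausalLM.from_pretrained(
    "bigcode/starcoderbase",
    load_in_8bit=True,
    device_map={"": Accelerator().process_index},
)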

xpl avatar May 30 '23 18:05 xpl

Hi, I'm using pretty much the same code (but with HuggingFaceH4/CodeAlpaca_20K) and running it with torchrun --nproc_per_node=8.

However, I get the following error: ValueError: DistributedDataParallel device_ids and output_device arguments only work with single-device/multiple-device GPU modules or CPU modules, but got device_ids [7], output_device 7, and module parameters {device(type='cuda', index=0), device(type='cuda', index=1), device(type='cuda', index=2), device(type='cuda', index=3), device(type='cuda', index=4), device(type='cuda', index=5), device(type='cuda', index=6)}.

Does anyone happen to know where the problem might be?

acforvs avatar Jun 14 '23 07:06 acforvs

(Quoting the original issue description and code above.)

Hey, did you solve the problem?

CEfanmin avatar Oct 13 '23 09:10 CEfanmin