No difference in performance or speed between different adapter configs
## Environment info
- `adapters` version: 0.1.2
- `transformers` version: 4.36.2
- Platform: Linux-5.8.0-63-generic-x86_64-with-glibc2.29
- Python version: 3.8.10
- Huggingface_hub version: 0.22.0
- Safetensors version: 0.4.2
- Accelerate version: 0.28.0
- Accelerate config: not found
- PyTorch version (GPU?): 2.0.0+cu117 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: Yes
- Using distributed or parallel set-up in script?: No
## Information
Model I am using (Bert, XLNet ...): XLM-RoBERTa
Language I am using the model on (English, Chinese ...): ["en", "et", "ht", "id", "it", "qu", "sw", "ta", "th", "tr", "vi", "zh"]
Adapter setup I am using (if any): ["lora", "seq_bn", "seq_bn_inv", "double_seq_bn", "double_seq_bn_inv"]
The problem arises when using:
- [x] the official example scripts: (give details below)
- [x] my own modified scripts: (give details below)
The task I am working on is:
- [x] an official GLUE/SQuAD task: COPA + XCOPA
- [ ] my own task or dataset: (give details below)
## To reproduce
Steps to reproduce the behavior:
Please run the following code. It's a modification of the 04_Cross_Lingual_Transfer notebook. When I run this script, I get exactly the same stats and performance on a given language, no matter which adapter configuration I choose for the language adapter. As far as I can tell, everything is identical: accuracy, training and eval losses, and training and eval samples per second. I thought there might be some caching going on in the background, so I added `del model`, ran garbage collection, and emptied the torch cache at the end of the loop, but it doesn't help.
These are the main modifications I've made:
- Convert the notebook to a `.py` script
- Iterate through all combinations of the languages and adapter configs listed above
- Modify the `compute_accuracy()` function so that it also records the accuracy on individual test set items (this was done for a uni project)
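Since the different configs should add different numbers of trainable parameters, here is a minimal standalone sketch (separate from the reproduction script below; `probe` is just a placeholder adapter name) of the kind of difference I expected to see between runs:

```python
# Sanity-check sketch: the adapter configs should differ in trainable parameter count.
from adapters import AutoAdapterModel, AdapterConfig

for cfg_name in ["seq_bn", "double_seq_bn", "lora"]:
    probe_model = AutoAdapterModel.from_pretrained("xlm-roberta-base")
    probe_model.add_adapter("probe", config=AdapterConfig.load(cfg_name))
    probe_model.train_adapter(["probe"])  # freeze everything except the adapter
    n_trainable = sum(p.numel() for p in probe_model.parameters() if p.requires_grad)
    print(f"{cfg_name}: {n_trainable} trainable parameters")
```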
```python
# -*- coding: utf-8 -*-
from transformers import TrainingArguments, AutoTokenizer, AutoConfig, enable_full_determinism, EvalPrediction
from datasets import load_dataset, concatenate_datasets
from adapters.composition import Stack
from adapters import AutoAdapterModel, AdapterConfig, AdapterTrainer
import csv
import numpy as np
from itertools import product
from functools import partial
import os
import gc
import torch
def encode_batch(examples):
    """Encodes a batch of input data using the model tokenizer."""
    all_encoded = {"input_ids": [], "attention_mask": []}
    # Iterate through all examples in this batch
    for premise, question, choice1, choice2 in zip(examples["premise"], examples["question"], examples["choice1"], examples["choice2"]):
        sentences_a = [premise + " " + question for _ in range(2)]
        # Both answer choices are passed in an array according to the format needed for the multiple-choice prediction head
        sentences_b = [choice1, choice2]
        encoded = tokenizer(
            sentences_a,
            sentences_b,
            max_length=60,
            truncation=True,
            padding="max_length",
        )
        all_encoded["input_ids"].append(encoded["input_ids"])
        all_encoded["attention_mask"].append(encoded["attention_mask"])
    return all_encoded
def preprocess_dataset(dataset):
    # Encode the input data
    dataset = dataset.map(encode_batch, batched=True)
    # The transformers model expects the target class column to be named "labels"
    dataset = dataset.rename_column("label", "labels")
    # Transform to pytorch tensors and only output the required columns
    dataset.set_format(columns=["input_ids", "attention_mask", "labels"])
    return dataset
def compute_accuracy(p: EvalPrediction, stats):
    preds = np.argmax(p.predictions, axis=1)
    for idx, (pred, label) in enumerate(zip(preds, p.label_ids)):
        stats["index"].append(idx)
        stats["prediction"].append(pred)
        stats["label"].append(label)
        stats["accuracy"].append(int(pred == label))
    return {"acc": (preds == p.label_ids).mean()}
if __name__ == "__main__":
    model_id = "xlm-roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model_config = AutoConfig.from_pretrained(model_id)

    dataset_en = load_dataset("super_glue", "copa")
    dataset_en = preprocess_dataset(dataset_en)
    train_dataset = concatenate_datasets([dataset_en["train"], dataset_en["validation"]])

    seeds = [1, 2]
    epochs = [8, 16]
    langs = ["et", "ht", "id", "it", "qu", "sw", "ta", "th", "tr", "vi", "zh"]
    adapter_configs = []
    for inverse, double in product([True, False], [True, False]):
        adapter_configs.append(f"{'double_' if double else ''}seq_bn{'_inv' if inverse else ''}")
    adapter_configs.append("lora")
    for seed, num_epochs, trg_lang, adapter_config in product(seeds, epochs, langs, adapter_configs):
        print("\n\n\n\n===============")
        print("seed:", seed)
        print("num_epochs:", num_epochs)
        print("trg_lang:", trg_lang)
        print("adapter_config:", adapter_config)
        lora = adapter_config == "lora"
        inverse = "inv" in adapter_config
        double = "double" in adapter_config

        enable_full_determinism(seed=seed)
        model = AutoAdapterModel.from_pretrained(
            model_id,
            config=model_config,
            device_map="auto",
        )

        # Load the language adapters
        lang_adapter_config = AdapterConfig.load(adapter_config, reduction_factor=2)
        model.load_adapter("en/wiki@ukp", config=lang_adapter_config)
        model.load_adapter(f"{trg_lang}/wiki@ukp", config=lang_adapter_config)
        # Add a new task adapter
        if lora:
            # model.add_adapter("copa", config="lora")
            # model.add_adapter("copa", config=lang_adapter_config)
            # The call below is the only one of the three that works. The other two throw:
            # ValueError: Invalid adapter setup: str is not a valid adapter name or composition block.
            model.add_adapter("copa")
        else:
            # sequential bottleneck (seq_bn) architecture (this is the default config)
            model.add_adapter("copa")

        # Add a classification head for our target task
        model.add_multiple_choice_head("copa", num_choices=2)
        model.train_adapter(["copa"])
        # Unfreeze and activate stack setup
        model.active_adapters = Stack("en", "copa")
        training_args = TrainingArguments(
            learning_rate=1e-4,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            logging_steps=100,
            output_dir="./training_output/adapters",
            overwrite_output_dir=True,
            # The next line is important to ensure the dataset labels are properly passed to the model
            remove_unused_columns=False,
        )
        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        train_output = trainer.train()
        print(train_output)
        train_samples_per_second = train_output.metrics["train_samples_per_second"]

        dataset_trg_lang = load_dataset("xcopa", trg_lang, verification_mode="no_checks")
        dataset_trg_lang = preprocess_dataset(dataset_trg_lang)
        # print(dataset_trg_lang["test"][0])
        model.active_adapters = Stack(trg_lang, "copa")
        stats = {
            "dataset": [],
            "system": [],
            "num_epochs": [],
            "seed": [],
            "index": [],
            "target_language": [],
            "lang_double_adapter": [],
            "lang_inverse_adapter": [],
            "lang_lora_adapter": [],
            # "task_lora_adapter": [],
            "prediction": [],
            "label": [],
            "accuracy": [],
            "overall_eval_accuracy": [],
            "train_samples_per_second": [],
            "eval_samples_per_second": [],
        }
        eval_trainer = AdapterTrainer(
            model=model,
            args=TrainingArguments(output_dir="./eval_output/adapters", remove_unused_columns=False),
            eval_dataset=dataset_trg_lang["test"],
            compute_metrics=partial(compute_accuracy, stats=stats),
        )
        eval_stats = eval_trainer.evaluate()
        print(eval_stats)
        num_datapoints = len(stats["index"])
        stats["dataset"] = ["xcopa"] * num_datapoints
        stats["system"] = ["adapters"] * num_datapoints
        stats["num_epochs"] = [num_epochs] * num_datapoints
        stats["seed"] = [seed] * num_datapoints
        stats["target_language"] = [trg_lang] * num_datapoints
        stats["lang_double_adapter"] = [int(double)] * num_datapoints
        stats["lang_inverse_adapter"] = [int(inverse)] * num_datapoints
        stats["lang_lora_adapter"] = [int(lora)] * num_datapoints
        # stats["task_lora_adapter"] = ["lora" if lora else "houlsby"] * num_datapoints
        stats["overall_eval_accuracy"] = [eval_stats["eval_acc"]] * num_datapoints
        stats["train_samples_per_second"] = [train_samples_per_second] * num_datapoints
        stats["eval_samples_per_second"] = [eval_stats["eval_samples_per_second"]] * num_datapoints
        for k, v in stats.items():
            assert len(v) == num_datapoints, f"\nThis column doesn't have the right amount of data:\nk:{k}\nv:\n{v}"

        filename = f"data/xcopa/{trg_lang}_{adapter_config}_{num_epochs}eps_seed{seed}.csv"
        directory = os.path.dirname(filename)
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(filename, "w") as f:
            writer = csv.writer(f)
            writer.writerow(stats.keys())
            writer.writerows(zip(*stats.values()))

        del model
        gc.collect()
        torch.cuda.empty_cache()
```
## Expected behavior
I expected to observe differences in performance or speed between the different adapter configs.