Multi-GPU setup causes error
Discussed in https://github.com/adapter-hub/adapters/discussions/649
Originally posted by moritzblum on February 13, 2024

Hello all,

I'm using the AutoAdapterModel on a multi-GPU setup as follows:
import torch
from adapters import AutoAdapterModel, AdapterTrainer
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate


if __name__ == "__main__":
    dataset = load_dataset("yelp_review_full")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

    model = AutoAdapterModel.from_pretrained("bert-base-cased")
    model.add_adapter("rotten_tomatoes", config="seq_bn")
    model.add_classification_head(
        "rotten_tomatoes",
        num_labels=5,
    )
    model.train_adapter("rotten_tomatoes")

    training_args = TrainingArguments(output_dir="test_trainer", report_to=None)
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    print('start training')
    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
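For reference, the Trainer only takes the multi-GPU code path when it sees more than one device; a quick way to confirm what it will do (a debugging sketch, not part of the original script) is:

import torch
from transformers import TrainingArguments

# With more than one visible GPU and no distributed launcher, the Trainer
# wraps the model in torch.nn.DataParallel.
print(torch.cuda.device_count())                  # number of visible GPUs
print(TrainingArguments(output_dir="tmp").n_gpu)  # device count the Trainer will use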
My packages:
transformers 4.37.2
adapters 0.1.1
torch 2.0.0+cu118
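(For completeness, the installed versions can be confirmed from Python; a small sketch, assuming all three packages expose __version__:

import adapters
import torch
import transformers

# Print the versions of the three relevant packages.
print(transformers.__version__, adapters.__version__, torch.__version__)
)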
On one GPU everything works fine. When multiple GPUs are available, I get the following error:
Traceback (most recent call last):
  File "/homes/mblum/blp/adapter_minimal_example.py", line 51, in <module>
    trainer.train()
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 1539, in train
    return inner_training_loop(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 1869, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 2772, in training_step
    loss = self.compute_loss(model, inputs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 2795, in compute_loss
    outputs = model(**inputs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/_utils.py", line 644, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/models/bert/adapter_model.py", line 69, in forward
    outputs, context = self.bert(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/context.py", line 116, in wrapper_func
    results = f(self, *args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/model_mixin.py", line 1270, in forward
    return super().forward(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 1013, in forward
    encoder_outputs = self.encoder(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 607, in forward
    layer_outputs = layer_module(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 539, in forward
    layer_output = apply_chunking_to_forward(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/pytorch_utils.py", line 236, in apply_chunking_to_forward
    return forward_fn(*input_tensors)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 552, in feed_forward_chunk
    layer_output = self.output(intermediate_output, attention_output)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/models/bert/modeling_bert.py", line 157, in forward
    hidden_states = self.bottleneck_layer_forward(hidden_states, input_tensor, self.LayerNorm)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/bottleneck.py", line 348, in bottleneck_layer_forward
    state = self.compose(adapter_setup, state)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/adapter_layer_base.py", line 472, in compose
    state = composition_func(adapter_setup, state, lvl=0)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/adapter_layer_base.py", line 308, in compose_stack
    state = self.compose_single(adapter_stack_layer, state, lvl=lvl + 1)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/bottleneck.py", line 230, in compose_single
    layer_output = adapter_layer(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/modeling.py", line 172, in forward
    down = self.adapter_down(x)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/container.py", line 217, in forward
    input = module(input)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)
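Restricting training to a single visible device avoids the DataParallel path entirely (a workaround sketch, assuming the replica on cuda:1 is the trigger; this must run before anything initializes CUDA):

import os

# Workaround sketch: hide all but one GPU *before* torch touches CUDA,
# so the Trainer never wraps the model in torch.nn.DataParallel.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

That matches the single-GPU case above, which works, but it leaves the other GPU unused; launching the script with torchrun (so the Trainer uses DistributedDataParallel instead of DataParallel) might be an alternative, though ideally the plain multi-GPU case would work too.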
Can anybody help me find the issue?