
Multi-GPU setup causes error

Open moritzblum opened this issue 1 year ago • 0 comments

Discussed in https://github.com/adapter-hub/adapters/discussions/649

Originally posted by moritzblum on February 13, 2024:

Hello all,

I'm using AutoAdapterModel on a multi-GPU setup as follows:

from adapters import AutoAdapterModel, AdapterTrainer
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments
import numpy as np
import evaluate


if __name__ == "__main__":

    dataset = load_dataset("yelp_review_full")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

    model = AutoAdapterModel.from_pretrained("bert-base-cased")
    model.add_adapter("rotten_tomatoes", config="seq_bn")

    model.add_classification_head(
        "rotten_tomatoes",
        num_labels=5
    )

    model.train_adapter("rotten_tomatoes")

    training_args = TrainingArguments(output_dir="test_trainer", report_to=None)

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    print('start training')
    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

My packages:

transformers             4.37.2
adapters                 0.1.1
torch                    2.0.0+cu118
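
For reference, the single-GPU runs can be forced by hiding all but one device before torch is imported; this is just one way to pin the run to a single GPU, nothing else in the script changes:

import os

# Make only the first GPU visible to this process; has to happen before torch
# initializes CUDA (simplest: before "import torch").
os.environ["CUDA_VISIBLE_DEVICES"] = "0"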

On one GPU everything works fine. When multiple GPUs are available, I get the following error:

Traceback (most recent call last):                                                                                                             
  File "/homes/mblum/blp/adapter_minimal_example.py", line 51, in <module>                                                                     
    trainer.train()                                                                                                                            
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 1539, in train                             
    return inner_training_loop(                                                                                                                
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 1869, in _inner_training_loop              
    tr_loss_step = self.training_step(model, inputs)                                                                                           
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 2772, in training_step                     
    loss = self.compute_loss(model, inputs)                                                                                                    
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/trainer.py", line 2795, in compute_loss                      
    outputs = model(**inputs)                                                                                                                  
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl                     
    return forward_call(*args, **kwargs)                                                                                                       
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/_utils.py", line 644, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/models/bert/adapter_model.py", line 69, in forward
    outputs, context = self.bert(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/context.py", line 116, in wrapper_func
    results = f(self, *args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/model_mixin.py", line 1270, in forward
    return super().forward(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 1013, in forward
    encoder_outputs = self.encoder( 
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 607, in forward
    layer_outputs = layer_module(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 539, in forward
    layer_output = apply_chunking_to_forward(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/pytorch_utils.py", line 236, in apply_chunking_to_forward
    return forward_fn(*input_tensors)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 552, in feed_forward_chunk
    layer_output = self.output(intermediate_output, attention_output)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/models/bert/modeling_bert.py", line 157, in forward
    hidden_states = self.bottleneck_layer_forward(hidden_states, input_tensor, self.LayerNorm)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/bottleneck.py", line 348, in bottleneck_layer_forward
    state = self.compose(adapter_setup, state)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/adapter_layer_base.py", line 472, in compose
    state = composition_func(adapter_setup, state, lvl=0)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/adapter_layer_base.py", line 308, in compose_stack
    state = self.compose_single(adapter_stack_layer, state, lvl=lvl + 1)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/bottleneck.py", line 230, in compose_single
    layer_output = adapter_layer(
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/adapters/methods/modeling.py", line 172, in forward
    down = self.adapter_down(x)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/container.py", line 217, in forward
    input = module(input)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/homes/mblum/miniconda3/envs/lp/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)
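
From the stack trace, the Trainer wraps the model in torch.nn.DataParallel once more than one GPU is visible, and the F.linear call inside the adapter's adapter_down module in replica 1 then sees its input and weight on different devices (cuda:0 vs cuda:1). As a hypothetical diagnostic (not in the script above), printing where every trainable parameter lives right before trainer.train() might help narrow down whether the adapter and head weights are placed as expected before replication:

# Hypothetical diagnostic, not part of the original script: list the device of
# every trainable parameter (adapter + classification head) before training.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.device)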

Can anybody help me find the issue?

moritzblum, Feb 15 '24, 10:02