Is DataParallel supported for XPU?
Describe the issue
I am facing errors with DataParallel.
Hi @yash3056, please describe your issue in detail and provide the code and steps to reproduce it.
@yash3056 DP is not fully supported on XPU for now. May I know why DP is needed in your case instead of DDP? As far as I remember, DP is being deprecated by PyTorch on GPU as well.
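For reference, DDP on XPU looks roughly like the sketch below (a minimal example, not an official recipe: it assumes a PyTorch build with XPU support and a oneCCL-based distributed backend, and the exact backend name and launcher depend on your stack). With the Accelerate-based script further down, the equivalent is simply to run it under `accelerate launch` with multiple processes instead of wrapping the model in `DataParallel`.

```python
# Minimal DDP-on-XPU sketch. Assumptions: a PyTorch build with XPU support and a
# distributed backend registered for XPU (with the IPEX stack this usually means
# `import oneccl_bindings_for_pytorch` so that the "ccl" backend is available;
# newer mainline builds may expose "xccl" instead).
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    # torchrun / mpirun export these for every worker process
    rank = int(os.environ.get("RANK", "0"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))

    dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)

    device = torch.device(f"xpu:{local_rank}")
    torch.xpu.set_device(device)

    # Toy model just to show the wrapping; DDP infers the device from the
    # parameters because the module lives on a single device.
    model = torch.nn.Linear(16, 2).to(device)
    ddp_model = DDP(model)

    x = torch.randn(8, 16, device=device)
    loss = ddp_model(x).sum()
    loss.backward()  # gradients are all-reduced across ranks here

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```

Launched with, for example, `torchrun --nproc_per_node=2 ddp_xpu.py` (the script name is arbitrary).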
> @yash3056 DP is not fully supported on XPU for now. May I know why DP is needed in your case instead of DDP? As far as I remember, DP is being deprecated by PyTorch on GPU as well.
- I wanted confirmation that DP is not supported. I am also facing errors with DDP.
- I am facing an engine error with XPU.
Here is the code in which I am facing the engine error:
```python
#!pip install accelerate==1.0.0rc1 datasets
```
```python
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
import torch
from sklearn.metrics import accuracy_score

# Load the IMDB dataset
dataset = load_dataset("imdb")

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# Tokenize the train and test datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])

train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

# Define DataLoaders for batching
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Load the pre-trained BERT model with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Initialize the Accelerator
accelerator = Accelerator()
device = accelerator.device
print(device)

# Move the model, optimizer and dataloaders to the appropriate device
model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, test_dataloader
)
```
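Accelerate should pick up the XPU automatically when one is visible. As a quick sanity check (a small addition, assuming a PyTorch build with XPU support), the following prints what the runtime actually sees:

```python
# Check that the XPU backend is present and that at least one device is visible.
import torch

print("torch version:", torch.__version__)
xpu_ok = hasattr(torch, "xpu") and torch.xpu.is_available()
print("XPU available:", xpu_ok)
if xpu_ok:
    print("XPU device count:", torch.xpu.device_count())
    print("XPU device 0:", torch.xpu.get_device_name(0))
```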
```python
from tqdm.auto import tqdm

def train(model, dataloader, optimizer, accelerator):
    model.train()
    total_loss = 0

    # Use tqdm for a progress bar
    loop = tqdm(dataloader, leave=True, desc="Training")

    for batch in loop:
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        accelerator.backward(loss)

        # Optimization step
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Update the tqdm description with the current loss
        loop.set_description(f"Training Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    return avg_loss
```
```python
def evaluate(model, dataloader, accelerator):
    model.eval()
    predictions, labels = [], []

    # Use tqdm for a progress bar
    loop = tqdm(dataloader, leave=True, desc="Evaluating")

    with torch.no_grad():
        for batch in loop:
            # Forward pass
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(accelerator.gather(preds).cpu().numpy())
            labels.extend(accelerator.gather(batch['labels']).cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)
    return accuracy
```
```python
def train(model, dataloader, optimizer, accelerator):
    model.train()
    total_loss = 0

    # Use tqdm for a progress bar
    loop = tqdm(dataloader, leave=True, desc="Training")

    for batch in loop:
        # Forward pass
        # Only pass input_ids and attention_mask to the model
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
        loss = outputs.loss

        # Backward pass
        accelerator.backward(loss)

        # Optimization step
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Update the tqdm description with the current loss
        loop.set_description(f"Training Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    return avg_loss
```
```python
epochs = 3

for epoch in range(epochs):
    # Train the model
    avg_train_loss = train(model, train_dataloader, optimizer, accelerator)

    # Evaluate the model
    accuracy = evaluate(model, test_dataloader, accelerator)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
```
Here is the error:
```
[WARNING] Failed to create Level Zero tracer: 2013265921

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[6], line 5
      1 epochs = 3
      3 for epoch in range(epochs):
      4     # Train the model
----> 5     avg_train_loss = train(model, train_dataloader, optimizer, accelerator)
      7     # Evaluate the model
      8     accuracy = evaluate(model, test_dataloader, accelerator)

Cell In[5], line 11, in train(model, dataloader, optimizer, accelerator)
      6 loop = tqdm(dataloader, leave=True, desc="Training")
      8 for batch in loop:
      9     # Forward pass
     10     # Only pass input_ids and attention_mask to the model
---> 11     outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
     12     loss = outputs.loss
     14     # Backward pass
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs) 1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(*args, **kwargs) 1543 try: 1544 result = None
File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:1695, in BertForSequenceClassification.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
   1687 r"""
   1688 labels (torch.LongTensor of shape (batch_size,), optional):
   1689     Labels for computing the sequence classification/regression loss. Indices should be in [0, ...,
   1690     config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss), If
   1691     config.num_labels > 1 a classification loss is computed (Cross-Entropy).
   1692 """
   1693 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1695 outputs = self.bert(
   1696     input_ids,
   1697     attention_mask=attention_mask,
   1698     token_type_ids=token_type_ids,
   1699     position_ids=position_ids,
   1700     head_mask=head_mask,
   1701     inputs_embeds=inputs_embeds,
   1702     output_attentions=output_attentions,
   1703     output_hidden_states=output_hidden_states,
   1704     return_dict=return_dict,
   1705 )
   1707 pooled_output = outputs[1]
   1709 pooled_output = self.dropout(pooled_output)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs) 1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(*args, **kwargs) 1543 try: 1544 result = None
File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:1141, in BertModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict) 1134 # Prepare head mask if needed 1135 # 1.0 in head_mask indicate we keep the head 1136 # attention_probs has shape bsz x n_heads x N x N 1137 # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] 1138 # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] 1139 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) -> 1141 encoder_outputs = self.encoder( 1142 embedding_output, 1143 attention_mask=extended_attention_mask, 1144 head_mask=head_mask, 1145 encoder_hidden_states=encoder_hidden_states, 1146 encoder_attention_mask=encoder_extended_attention_mask, 1147 past_key_values=past_key_values, 1148 use_cache=use_cache, 1149 output_attentions=output_attentions, 1150 output_hidden_states=output_hidden_states, 1151 return_dict=return_dict, 1152 ) 1153 sequence_output = encoder_outputs[0] 1154 pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs) 1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(*args, **kwargs) 1543 try: 1544 result = None
File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:694, in BertEncoder.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict) 683 layer_outputs = self._gradient_checkpointing_func( 684 layer_module.call, 685 hidden_states, (...) 691 output_attentions, 692 ) 693 else: --> 694 layer_outputs = layer_module( 695 hidden_states, 696 attention_mask, 697 layer_head_mask, 698 encoder_hidden_states, 699 encoder_attention_mask, 700 past_key_value, 701 output_attentions, 702 ) 704 hidden_states = layer_outputs[0] 705 if use_cache:
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs) 1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(*args, **kwargs) 1543 try: 1544 result = None
File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:584, in BertLayer.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions) 572 def forward( 573 self, 574 hidden_states: torch.Tensor, (...) 581 ) -> Tuple[torch.Tensor]: 582 # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 583 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None --> 584 self_attention_outputs = self.attention( 585 hidden_states, 586 attention_mask, 587 head_mask, 588 output_attentions=output_attentions, 589 past_key_value=self_attn_past_key_value, 590 ) 591 attention_output = self_attention_outputs[0] 593 # if decoder, the last output is tuple of self-attn cache
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs) 1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(*args, **kwargs) 1543 try: 1544 result = None
File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:514, in BertAttention.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions) 504 def forward( 505 self, 506 hidden_states: torch.Tensor, (...) 512 output_attentions: Optional[bool] = False, 513 ) -> Tuple[torch.Tensor]: --> 514 self_outputs = self.self( 515 hidden_states, 516 attention_mask, 517 head_mask, 518 encoder_hidden_states, 519 encoder_attention_mask, 520 past_key_value, 521 output_attentions, 522 ) 523 attention_output = self.output(self_outputs[0], hidden_states) 524 outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs) 1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(*args, **kwargs) 1543 try: 1544 result = None
File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:394, in BertSdpaSelfAttention.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions) 382 return super().forward( 383 hidden_states, 384 attention_mask, (...) 389 output_attentions, 390 ) 392 bsz, tgt_len, _ = hidden_states.size() --> 394 query_layer = self.transpose_for_scores(self.query(hidden_states)) 396 # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention 397 # mask needs to be such that the encoder's padding tokens are not attended to. 398 is_cross_attention = encoder_hidden_states is not None
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs) 1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(*args, **kwargs) 1543 try: 1544 result = None
File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/linear.py:116, in Linear.forward(self, input) 115 def forward(self, input: Tensor) -> Tensor: --> 116 return F.linear(input, self.weight, self.bias)
RuntimeError: could not create an engine
```
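According to the traceback, the failure happens in the plain `F.linear` call inside BERT's self-attention, not in anything DP-, DDP- or Accelerate-specific. A tiny standalone check on the XPU (a hypothetical triage snippet, not from the original report) may reproduce the engine error without the full training script:

```python
# Run a single Linear layer on the XPU with the shapes BERT uses here
# (batch 8, sequence 512, hidden 768). If this already raises
# "RuntimeError: could not create an engine", the problem is in the basic
# XPU/oneDNN setup on this machine rather than in DataParallel, DDP or Accelerate.
import torch

device = torch.device("xpu")
layer = torch.nn.Linear(768, 768).to(device)
x = torch.randn(8, 512, 768, device=device)
y = layer(x)
torch.xpu.synchronize()
print(y.shape, y.device)
```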
@gujinghui @alexsin368 This code runs fine with PyTorch 2.6 (mainline).