Transformers-Tutorials icon indicating copy to clipboard operation
Transformers-Tutorials copied to clipboard

'DataLoader' object is not subscriptable. Issues with the dataset and dataloader?

Open tbetth01 opened this issue 2 years ago • 0 comments

I am having some issues adapting the tutorials to my fine tuning use case. I think it has something to do with the way I am reading in the data to train_ds and test_ds but cannot figure out what I need to change. Any idea?

#Concat everything together into a single dataframe
df = pd.concat([dfHealthy,dfUnHealthy,dfPublic])
df.rename(columns={'message': 'text', 'healthLabel': 'label'}, inplace=True)
#pair down the df
df = df[:250000]
train=df.sample(frac=0.8,random_state=200) #random state is a seed value
test=df.drop(train.index)
df.to_csv('all.csv',index=False)
train.to_csv('train.csv',index=False)
test.to_csv('test.csv',index=False)
from datasets import load_dataset
from datasets import Features, Value, ClassLabel
#dataset = load_dataset('csv', data_files='test.csv',split='train')
#train_ds, test_ds = load_dataset('csv', data_files='test.csv',split=['train','test'])

class_names = ['good', 'bad']
ft = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})
train_ds = load_dataset('csv', data_files='train.csv',features=ft)
test_ds = load_dataset('csv', data_files='test.csv',features=ft)

import transformers
from transformers import PerceiverTokenizer, AutoTokenizer, AutoModelForMaskedLM, DataCollatorWithPadding

tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length')
train_ds = train_ds.map(lambda examples: tokenizer(examples['text'],padding="max_length",max_length=2048), batched=True)
test_ds = test_ds.map(lambda examples: tokenizer(examples['text'],padding="max_length",max_length=2048), batched=True)
train_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_ds, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=4)
from transformers import PerceiverForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = PerceiverForSequenceClassification.from_pretrained("deepmind/language-perceiver",
                                                               num_labels=2,
                                                               id2label=id2label,
                                                               label2id=label2id)
model.to(device)
from transformers import AdamW
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(20):  # loop over the dataset multiple times
    print("Epoch:", epoch)
    for batch in tqdm(train_dataloader['train']):
        # get the inputs; 
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # evaluate
        predictions = outputs.logits.argmax(-1).cpu().detach().numpy()
        accuracy = accuracy_score(y_true=batch["label"].numpy(), y_pred=predictions)
        print(f"Loss: {loss.item()}, Accuracy: {accuracy}")
/opt/oss/conda3/lib/python3.7/site-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  FutureWarning,
Epoch: 0
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_9875/1757304568.py in <module>
      8 for epoch in range(20):  # loop over the dataset multiple times
      9     print("Epoch:", epoch)
---> 10     for batch in tqdm(train_dataloader['train']):
     11         # get the inputs;
     12         inputs = batch["input_ids"].to(device)

TypeError: 'DataLoader' object is not subscriptable

tbetth01 avatar May 27 '22 13:05 tbetth01