Transformers-Tutorials
'DataLoader' object is not subscriptable. Issues with the dataset and dataloader?
I am having some issues adapting the tutorials to my fine-tuning use case. I think it has something to do with the way I am reading the data into train_ds and test_ds, but I cannot figure out what I need to change. Any ideas?
import pandas as pd

# Concatenate everything into a single dataframe
df = pd.concat([dfHealthy, dfUnHealthy, dfPublic])
df.rename(columns={'message': 'text', 'healthLabel': 'label'}, inplace=True)

# Pare down the df
df = df[:250000]
train = df.sample(frac=0.8, random_state=200)  # random state is a seed value
test = df.drop(train.index)

df.to_csv('all.csv', index=False)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
from datasets import load_dataset
from datasets import Features, Value, ClassLabel

#dataset = load_dataset('csv', data_files='test.csv', split='train')
#train_ds, test_ds = load_dataset('csv', data_files='test.csv', split=['train','test'])
class_names = ['good', 'bad']
ft = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})
train_ds = load_dataset('csv', data_files='train.csv', features=ft)
test_ds = load_dataset('csv', data_files='test.csv', features=ft)
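One thing I am not sure about: as far as I understand, load_dataset on a CSV without a split argument returns a DatasetDict (with a single 'train' split) rather than a plain Dataset, so train_ds and test_ds here may actually be dict-like wrappers. A quick check I could add (my own sketch, not from the tutorial):

# Sketch: inspect what load_dataset actually returned.
# My assumption: both are DatasetDicts with a single 'train' split,
# even for test.csv, since 'train' is the default split name.
print(type(train_ds))        # expected: datasets.dataset_dict.DatasetDict
print(train_ds)              # should list the splits and their columns
print(train_ds['train'][0])  # first example of the underlying Dataset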
import transformers
from transformers import PerceiverTokenizer, AutoTokenizer, AutoModelForMaskedLM, DataCollatorWithPadding

tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length')

train_ds = train_ds.map(lambda examples: tokenizer(examples['text'], padding="max_length", max_length=2048), batched=True)
test_ds = test_ds.map(lambda examples: tokenizer(examples['text'], padding="max_length", max_length=2048), batched=True)
train_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_ds, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=4)
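If the DatasetDict assumption above is right, I wonder whether the DataLoaders should be built from the split itself rather than from the DatasetDict, something like the sketch below (untested; note the split inside test_ds would also be named 'train', since that is the default name load_dataset gives a single CSV file):

# Sketch (assumption): wrap the underlying split, not the DatasetDict
train_dataloader = DataLoader(train_ds['train'], batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_ds['train'], batch_size=4)
# ...and then iterate the DataLoader directly:
# for batch in tqdm(train_dataloader):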
from transformers import PerceiverForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# label mappings built from the class names defined above
id2label = {idx: name for idx, name in enumerate(class_names)}
label2id = {name: idx for idx, name in enumerate(class_names)}

model = PerceiverForSequenceClassification.from_pretrained("deepmind/language-perceiver",
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)
model.to(device)
from transformers import AdamW
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()
for epoch in range(20): # loop over the dataset multiple times
    print("Epoch:", epoch)
    for batch in tqdm(train_dataloader['train']):
        # get the inputs;
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(inputs=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # evaluate
        predictions = outputs.logits.argmax(-1).cpu().detach().numpy()
        accuracy = accuracy_score(y_true=batch["label"].numpy(), y_pred=predictions)
        print(f"Loss: {loss.item()}, Accuracy: {accuracy}")
Running this, I get the AdamW deprecation warning and then the error below:

/opt/oss/conda3/lib/python3.7/site-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
FutureWarning,
Epoch: 0
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_9875/1757304568.py in <module>
      8 for epoch in range(20): # loop over the dataset multiple times
      9     print("Epoch:", epoch)
---> 10     for batch in tqdm(train_dataloader['train']):
     11         # get the inputs;
     12         inputs = batch["input_ids"].to(device)

TypeError: 'DataLoader' object is not subscriptable