lightning-language-modeling
KeyError: 'size' when running the model
Hi,
I'm working on adapting this code to build a language model on a medical dataset. Before running the full model on the entire dataset, I'm running it on a sample. Specifically, I just get a single batch from my train_dataloader and pass that into the model to make sure I get a loss. Here is the code for my data module and model (they are pretty much the same as in the repo, but I'm sharing them here for convenience):
import logging
from argparse import Namespace

import pytorch_lightning as pl
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (AdamW, AutoConfig, AutoModelForMaskedLM,
                          AutoTokenizer, DataCollatorForLanguageModeling)

logger = logging.getLogger(__name__)


class LMDataModule(pl.LightningDataModule):
    def __init__(self, model_name_or_path, train_file, val_file, line_by_line,
                 pad_to_max_len, max_seq_len, mlm_prob, batch_size, overwrite_cache):
        super().__init__()
        self.train_file = train_file
        self.val_file = val_file
        self.model_name_or_path = model_name_or_path
        self.line_by_line = line_by_line
        self.pad_to_max_len = pad_to_max_len
        self.max_seq_len = max_seq_len
        self.mlm_prob = mlm_prob
        self.batch_size = batch_size
        self.overwrite_cache = overwrite_cache

    def setup(self, stage):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        data_files = {'train': self.train_file, 'val': self.val_file}
        datasets = load_dataset('text', data_files=data_files)
        padding = 'max_length' if self.pad_to_max_len else False

        if self.line_by_line:
            # When using line_by_line, we just tokenize each nonempty line.
            def tokenize_fn(ex):
                ex['text'] = [line for line in ex['text']
                              if len(line) > 0 and not line.isspace()]
                # We use this option because DataCollatorForLanguageModeling is more
                # efficient when it receives the `special_tokens_mask`.
                return tokenizer(ex['text'], padding=padding, truncation=True,
                                 max_length=self.max_seq_len,
                                 return_special_tokens_mask=True)
        else:
            # We tokenize every text, then concatenate them together before
            # splitting them into smaller parts.
            def tokenize_fn(ex):
                return tokenizer(ex['text'], return_special_tokens_mask=True)

        tokenized_datasets = datasets.map(tokenize_fn, batched=True,
                                          remove_columns=['text'],
                                          load_from_cache_file=not self.overwrite_cache)

        if self.max_seq_len is None:
            self.max_seq_len = tokenizer.model_max_length
        else:
            if self.max_seq_len > tokenizer.model_max_length:
                logger.warning(
                    f"The maximum sequence length ({self.max_seq_len}) is larger than "
                    f"the maximum length of the model ({tokenizer.model_max_length}). "
                    f"Using max_seq_len={tokenizer.model_max_length}")
            self.max_seq_len = min(self.max_seq_len, tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from our
        # dataset and generate chunks of max_seq_len.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_len = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead of this drop
            # if the model supported it.
            total_len = (total_len // self.max_seq_len) * self.max_seq_len
            result = {
                k: [t[i: i + self.max_seq_len]
                    for i in range(0, total_len, self.max_seq_len)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together,
        # so group_texts throws away a remainder for each of those groups of 1,000
        # texts. You can adjust that batch_size here, but a higher value might be
        # slower to preprocess.
        tokenized_datasets = tokenized_datasets.map(
            group_texts, batched=True, load_from_cache_file=not self.overwrite_cache)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm_probability=self.mlm_prob)

        self.train_ds = tokenized_datasets['train']
        self.val_ds = tokenized_datasets['val']
        self.data_collator = data_collator

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size,
                          collate_fn=self.data_collator)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size,
                          collate_fn=self.data_collator)
class LMModel(pl.LightningModule):
    def __init__(self, model_name_or_path, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        config = AutoConfig.from_pretrained(model_name_or_path, return_dict=True)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path, config=config)

    def forward(self, x):
        return self.model(x).logits

    def training_step(self, batch, batch_idx):
        loss = self.model(**batch).loss
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.model(**batch).loss
        self.log('val_loss', loss, on_step=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(),
                          lr=self.hparams.learning_rate,
                          betas=(self.hparams.adam_beta1, self.hparams.adam_beta2),
                          eps=self.hparams.adam_epsilon)
        return optimizer
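Aside: to convince myself that the group_texts step does what its comment says, I ran a tiny standalone version of the same logic with made-up token ids. This is just an illustration, not part of the module:

# Toy illustration of the group_texts logic with max_seq_len = 4 (made-up ids):
examples = {'input_ids': [[1, 2, 3], [4, 5, 6, 7, 8]]}
concatenated = {k: sum(v, []) for k, v in examples.items()}    # {'input_ids': [1, 2, ..., 8]}
total_len = (len(concatenated['input_ids']) // 4) * 4          # 8; any remainder is dropped
chunks = [concatenated['input_ids'][i:i + 4] for i in range(0, total_len, 4)]
print(chunks)                                                  # [[1, 2, 3, 4], [5, 6, 7, 8]]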
Here is the code to run a single batch:
args = Namespace(batch_size=4,
                 line_by_line=True,
                 max_seq_len=32,
                 mlm_prob=0.15,
                 model_name_or_path='bert-base-uncased',
                 overwrite_cache=True,
                 pad_to_max_len=False,
                 train_file='project_dir/sample.txt',
                 val_file='project_dir/sample.txt',
                 )
dm = LMDataModule(**vars(args))
dm.setup(stage=None)
learning_rate = 5e-5
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 1e-8
model = LMModel(args.model_name_or_path, learning_rate, adam_beta1, adam_beta2, adam_epsilon)
train_dl = dm.train_dataloader()
itr = iter(train_dl)
batch = next(itr)
model(batch)
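(For context, printing the batch right before that last call, just as a quick check on my side, shows a dict-like object of tensors; the exact keys depend on the tokenizer and collator:)

# Quick inspection of the collated batch (sketch; with bert-base-uncased and
# DataCollatorForLanguageModeling I'd expect input_ids, token_type_ids,
# attention_mask and labels):
print(type(batch))
for k, v in batch.items():
    print(k, tuple(v.shape))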
When I run the above code, I get the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/tokenization_utils_base.py in __getattr__(self, item)
313 try:
--> 314 return self.data[item]
315 except KeyError:
KeyError: 'size'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-41-bd0200007a4a> in <module>
----> 1 model(x)
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
<ipython-input-6-351e2f77fe3a> in forward(self, x)
7
8 def forward(self, x):
----> 9 return self.model(x).logits
10
11 def training_step(self, batch, batch_idx):
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)
1313 output_attentions=output_attentions,
1314 output_hidden_states=output_hidden_states,
-> 1315 return_dict=return_dict,
1316 )
1317
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
917 raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
918 elif input_ids is not None:
--> 919 input_shape = input_ids.size()
920 batch_size, seq_length = input_shape
921 elif inputs_embeds is not None:
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/tokenization_utils_base.py in __getattr__(self, item)
314 return self.data[item]
315 except KeyError:
--> 316 raise AttributeError
317
318 def __getstate__(self):
AttributeError:
Interestingly, the error seems to occur in transformers/tokenization_utils_base.py. However, the LMDataModule runs without any problems. I'm trying to tackle the error, but I thought I'd share it here in case anyone else has had a similar error and found a solution.
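My current guess, going by the traceback, is that model(batch) passes the whole dict-like batch into forward as x, so self.model(x) treats it as input_ids and calls .size() on it, which a BatchEncoding doesn't support. Unpacking the dict, the way training_step already does, seems to avoid that. A minimal sketch of what I mean (assuming the objects defined above; not a confirmed fix):

# model(batch)         -> forward(x=batch) -> self.model(batch), i.e. input_ids=batch
# model.model(**batch) -> input_ids=..., attention_mask=..., labels=..., etc.
outputs = model.model(**batch)   # unpack the collated dict into keyword arguments
print(outputs.loss)              # labels come from DataCollatorForLanguageModeling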
hi, can you check if you're using the same versions of the libraries (e.g. transformers) as listed in requirements.txt?
Here are the versions I use:
ptl.__version__ = '1.2.3'
datasets.__version__ = '1.4.1'
transformers.__version__ = '4.3.3'
hi, can you check if you're using the same versions of the libraries (e.g. transformers) as listed in requirements.txt?
From the above, the answer is no. I might have to install the correct versions of at least transformers to see if it works. Do you think using the latest PyTorch Lightning version will also be a problem?
Also, I use the datasets library from HuggingFace and not tensorflow_datasets. Will that be a problem? (I haven't encountered any issues so far on the data processing side.)
I'd recommend starting from a new conda environment and running pip install -r requirements.txt.
I did the installs, and now I'm getting a new error when importing transformers:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-2-279c49635b32> in <module>
----> 1 import transformers
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/__init__.py in <module>
624
625 # Trainer
--> 626 from .trainer import Trainer
627 from .trainer_pt_utils import torch_distributed_zero_first
628 else:
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/trainer.py in <module>
67 TrainerState,
68 )
---> 69 from .trainer_pt_utils import (
70 DistributedTensorGatherer,
71 SequentialDistributedSampler,
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/trainer_pt_utils.py in <module>
38 SAVE_STATE_WARNING = ""
39 else:
---> 40 from torch.optim.lr_scheduler import SAVE_STATE_WARNING
41
42 logger = logging.get_logger(__name__)
ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler' (/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/optim/lr_scheduler.py)
There is an issue for this, but I couldn't find any real solution. Is there a reason why this code wouldn't run on the latest version of transformers?
Looking at the issue, it seems related to "not compatible with PyTorch 1.4.0". Maybe try installing a different (newer) version of PyTorch?
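A quick way to check what the environment currently has, and whether the installed torch still exposes the symbol that transformers 4.3.x tries to import (just a sanity-check sketch):

# Print the installed torch version and check for the symbol the failing
# import in trainer_pt_utils.py looks for.
import torch
import torch.optim.lr_scheduler as lr_scheduler

print(torch.__version__)
print(hasattr(lr_scheduler, "SAVE_STATE_WARNING"))  # False would explain the ImportError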