lightning-language-modeling
KeyError: 'size' when running the model
Hi,
I'm working on adapting this code to build a language model on a medical dataset. Before running the full model on the entire dataset, I'm running it on a sample. Specifically, I just get a single batch from my train_dataloader and pass that into the model to make sure I get a loss. Here is the code for my data module and model (they are pretty much the same as in the repo, but I'm sharing them here for convenience):
import logging
from argparse import Namespace

import pytorch_lightning as pl
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (AdamW, AutoConfig, AutoModelForMaskedLM,
                          AutoTokenizer, DataCollatorForLanguageModeling)

logger = logging.getLogger(__name__)


class LMDataModule(pl.LightningDataModule):
    def __init__(self, model_name_or_path, train_file, val_file, line_by_line,
                 pad_to_max_len, max_seq_len, mlm_prob, batch_size, overwrite_cache):
        super().__init__()
        self.train_file = train_file
        self.val_file = val_file
        self.model_name_or_path = model_name_or_path
        self.line_by_line = line_by_line
        self.pad_to_max_len = pad_to_max_len
        self.max_seq_len = max_seq_len
        self.mlm_prob = mlm_prob
        self.batch_size = batch_size
        self.overwrite_cache = overwrite_cache

    def setup(self, stage):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        data_files = {'train': self.train_file, 'val': self.val_file}
        datasets = load_dataset('text', data_files=data_files)
        padding = 'max_length' if self.pad_to_max_len else False

        if self.line_by_line:
            # When using line_by_line, we just tokenize each nonempty line.
            def tokenize_fn(ex):
                ex['text'] = [line for line in ex['text']
                              if len(line) > 0 and not line.isspace()]
                # We use this option because DataCollatorForLanguageModeling is more
                # efficient when it receives the `special_tokens_mask`.
                return tokenizer(ex['text'], padding=padding, truncation=True,
                                 max_length=self.max_seq_len,
                                 return_special_tokens_mask=True)
        else:
            # We tokenize every text, then concatenate them together before
            # splitting them into smaller parts.
            def tokenize_fn(ex):
                return tokenizer(ex['text'], return_special_tokens_mask=True)

        tokenized_datasets = datasets.map(tokenize_fn, batched=True,
                                          remove_columns=['text'],
                                          load_from_cache_file=not self.overwrite_cache)

        if self.max_seq_len is None:
            self.max_seq_len = tokenizer.model_max_length
        else:
            if self.max_seq_len > tokenizer.model_max_length:
                logger.warning(
                    f"The maximum sequence length ({self.max_seq_len}) is larger than "
                    f"the maximum length of the model ({tokenizer.model_max_length}). "
                    f"Using max_seq_len={tokenizer.model_max_length}")
            self.max_seq_len = min(self.max_seq_len, tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from our
        # dataset and generate chunks of max_seq_len.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_len = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead of this drop
            # if the model supported it.
            total_len = (total_len // self.max_seq_len) * self.max_seq_len
            result = {
                k: [t[i: i + self.max_seq_len]
                    for i in range(0, total_len, self.max_seq_len)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together,
        # so group_texts throws away a remainder for each of those groups of 1,000
        # texts. You can adjust that batch_size here, but a higher value might be
        # slower to preprocess.
        tokenized_datasets = tokenized_datasets.map(
            group_texts, batched=True, load_from_cache_file=not self.overwrite_cache)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm_probability=self.mlm_prob)

        self.train_ds = tokenized_datasets['train']
        self.val_ds = tokenized_datasets['val']
        self.data_collator = data_collator

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size,
                          collate_fn=self.data_collator)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size,
                          collate_fn=self.data_collator)
class LMModel(pl.LightningModule):
    def __init__(self, model_name_or_path, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        config = AutoConfig.from_pretrained(model_name_or_path, return_dict=True)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path, config=config)

    def forward(self, x):
        return self.model(x).logits

    def training_step(self, batch, batch_idx):
        loss = self.model(**batch).loss
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.model(**batch).loss
        self.log('val_loss', loss, on_step=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(),
                          lr=self.hparams.learning_rate,
                          betas=(self.hparams.adam_beta1, self.hparams.adam_beta2),
                          eps=self.hparams.adam_epsilon)
        return optimizer
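Aside: to convince myself that the group_texts step does what its comment says, I ran a tiny standalone version of the same logic with made-up token ids. This is just an illustration, not part of the module:

# Toy illustration of the group_texts logic with max_seq_len = 4 (made-up ids):
examples = {'input_ids': [[1, 2, 3], [4, 5, 6, 7, 8]]}
concatenated = {k: sum(v, []) for k, v in examples.items()}    # {'input_ids': [1, 2, ..., 8]}
total_len = (len(concatenated['input_ids']) // 4) * 4          # 8; any remainder is dropped
chunks = [concatenated['input_ids'][i:i + 4] for i in range(0, total_len, 4)]
print(chunks)                                                  # [[1, 2, 3, 4], [5, 6, 7, 8]]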
Here is the code to run a single batch:
args = Namespace(batch_size=4,
                 line_by_line=True,
                 max_seq_len=32,
                 mlm_prob=0.15,
                 model_name_or_path='bert-base-uncased',
                 overwrite_cache=True,
                 pad_to_max_len=False,
                 train_file='project_dir/sample.txt',
                 val_file='project_dir/sample.txt',
                 )
dm = LMDataModule(**vars(args))
dm.setup(stage=None)
learning_rate = 5e-5
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 1e-8
model = LMModel(args.model_name_or_path, learning_rate, adam_beta1, adam_beta2, adam_epsilon)
train_dl = dm.train_dataloader()
itr = iter(train_dl)
batch = next(itr)
model(batch)
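(For context, printing the batch right before that last call, just as a quick check on my side, shows a dict-like object of tensors; the exact keys depend on the tokenizer and collator:)

# Quick inspection of the collated batch (sketch; with bert-base-uncased and
# DataCollatorForLanguageModeling I'd expect input_ids, token_type_ids,
# attention_mask and labels):
print(type(batch))
for k, v in batch.items():
    print(k, tuple(v.shape))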
When I run the above code, I get the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/tokenization_utils_base.py in __getattr__(self, item)
313 try:
--> 314 return self.data[item]
315 except KeyError:
KeyError: 'size'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-41-bd0200007a4a> in <module>
----> 1 model(x)
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
<ipython-input-6-351e2f77fe3a> in forward(self, x)
7
8 def forward(self, x):
----> 9 return self.model(x).logits
10
11 def training_step(self, batch, batch_idx):
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)
1313 output_attentions=output_attentions,
1314 output_hidden_states=output_hidden_states,
-> 1315 return_dict=return_dict,
1316 )
1317
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
917 raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
918 elif input_ids is not None:
--> 919 input_shape = input_ids.size()
920 batch_size, seq_length = input_shape
921 elif inputs_embeds is not None:
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/tokenization_utils_base.py in __getattr__(self, item)
314 return self.data[item]
315 except KeyError:
--> 316 raise AttributeError
317
318 def __getstate__(self):
AttributeError:
Interestingly, the error seems to occur in transformers/tokenization_utils_base.py. However, the LMDataModule runs without any problems. I'm trying to tackle the error, but I thought I'd share it here in case anyone else has had a similar error and found a solution.
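My current guess, going by the traceback, is that model(batch) passes the whole dict-like batch into forward as x, so self.model(x) treats it as input_ids and calls .size() on it, which a BatchEncoding doesn't support. Unpacking the dict, the way training_step already does, seems to avoid that. A minimal sketch of what I mean (assuming the objects defined above; not a confirmed fix):

# model(batch)         -> forward(x=batch) -> self.model(batch), i.e. input_ids=batch
# model.model(**batch) -> input_ids=..., attention_mask=..., labels=..., etc.
outputs = model.model(**batch)   # unpack the collated dict into keyword arguments
print(outputs.loss)              # labels come from DataCollatorForLanguageModeling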
hi, can you check if you're using the same versions of the libraries (e.g. transformers) as listed in requirements.txt?
Here are the versions I use:
ptl.__version__ = '1.2.3'
datasets.__version__ = '1.4.1'
transformers.__version__ = '4.3.3'
hi, can you check if you're using the same versions of the libraries (e.g. transformers) as listed in requirements.txt?
From the above, the answer is no. I might have to install the correct versions of at least transformers to see if it works. Do you think using the latest PyTorch Lightning version will also be a problem?
Also, I use the datasets library from HuggingFace and not tensorflow_datasets. Will that be a problem? (I haven't encountered any issues so far on the data processing side.)
I'd recommend starting from a new conda environment and running pip install -r requirements.txt.
I did the installs, and now I'm getting a new error when importing transformers:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-2-279c49635b32> in <module>
----> 1 import transformers
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/__init__.py in <module>
624
625 # Trainer
--> 626 from .trainer import Trainer
627 from .trainer_pt_utils import torch_distributed_zero_first
628 else:
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/trainer.py in <module>
67 TrainerState,
68 )
---> 69 from .trainer_pt_utils import (
70 DistributedTensorGatherer,
71 SequentialDistributedSampler,
/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/transformers/trainer_pt_utils.py in <module>
38 SAVE_STATE_WARNING = ""
39 else:
---> 40 from torch.optim.lr_scheduler import SAVE_STATE_WARNING
41
42 logger = logging.get_logger(__name__)
ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler' (/net/vaosl01/opt/NFS/su0/anaconda3/envs/scatr/lib/python3.7/site-packages/torch/optim/lr_scheduler.py)
There is an issue for this, but I couldn't find any real solution. Is there a reason why this code wouldn't run on the latest version of transformers?
Looking at the issue, it seems related to "not compatible with PyTorch 1.4.0". Maybe try installing a different (newer) version of PyTorch?
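A quick way to check what the environment currently has, and whether the installed torch still exposes the symbol that transformers 4.3.x tries to import (just a sanity-check sketch):

# Print the installed torch version and check for the symbol the failing
# import in trainer_pt_utils.py looks for.
import torch
import torch.optim.lr_scheduler as lr_scheduler

print(torch.__version__)
print(hasattr(lr_scheduler, "SAVE_STATE_WARNING"))  # False would explain the ImportError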