simpletransformers
simpletransformers copied to clipboard
TypeError: an integer is required (got type NoneType)
I want to pretrain BERT on a custom corpus, but I got this error:
Source Code from simpletransformers.language_modeling import LanguageModelingModel, LanguageModelingArgs from sklearn.model_selection import train_test_split import pandas as pd import tokenizers
corpus_path = "/kaggle/input/line_by_line_corpus.txt"
print('Training Tokenizer...') tokenizer = tokenizers.BertWordPieceTokenizer() tokenizer.train( files=[corpus_path], vocab_size=50000, min_frequency=2, limit_alphabet=1000 ) tokenizer.save("/kaggle/working/tokenizer.json", pretty=True)
model_args = LanguageModelingArgs( tokenizer_name='/kaggle/working/tokenizer.json', overwrite_output_dir = True, save_steps = 10_000, )
model = LanguageModelingModel( "bert", None, args=model_args, train_files=corpus_path )
model.train_model(corpus_path)
Environment
- Kaggle with P100 GPU
Error!!
TypeError Traceback (most recent call last)
/tmp/ipykernel_23/3446545645.py in <module>
/opt/conda/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_model.py in train_model(self, train_file, output_dir, show_running_loss, args, eval_file, verbose, **kwargs) 468 eval_file=eval_file, 469 verbose=verbose, --> 470 **kwargs, 471 ) 472
/opt/conda/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_model.py in train(self, train_dataset, output_dir, show_running_loss, eval_file, verbose, **kwargs) 785 mininterval=0, 786 ) --> 787 for step, batch in enumerate(batch_iterator): 788 if steps_trained_in_current_epoch > 0: 789 steps_trained_in_current_epoch -= 1
/opt/conda/lib/python3.7/site-packages/tqdm/notebook.py in __iter__(self) 257 try: 258 it = super(tqdm_notebook, self).__iter__() --> 259 for obj in it: 260 # return super(tqdm...) will not catch exception 261 yield obj
/opt/conda/lib/python3.7/site-packages/tqdm/std.py in __iter__(self) 1193 1194 try: -> 1195 for obj in iterable: 1196 yield obj 1197 # Update and possibly print the progressbar.
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self) 626 # TODO(https://github.com/pytorch/pytorch/issues/76750) 627 self._reset() # type: ignore[call-arg] --> 628 data = self._next_data() 629 self._num_yielded += 1 630 if self._dataset_kind == _DatasetKind.Iterable and \
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self) 669 def _next_data(self): 670 index = self._next_index() # may raise StopIteration --> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration 672 if self._pin_memory: 673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index) 56 data = self.dataset.__getitems__(possibly_batched_index) 57 else: ---> 58 data = [self.dataset[idx] for idx in possibly_batched_index] 59 else: 60 data = self.dataset[possibly_batched_index]
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
/opt/conda/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_utils.py in __getitem__(self, item) 210 211 def __getitem__(self, item): --> 212 return torch.tensor(self.examples[item], dtype=torch.long) 213 214
TypeError: an integer is required (got type NoneType)
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.