evaluate
evaluate copied to clipboard
`perplexity` yields `IndexError: index out of range in self`
Small snippet to reproduce the problem:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Minimal reproduction: computing the `perplexity` metric with gpt2 on
# minipile test texts, without passing `max_length`, raises
# `IndexError: index out of range in self` (see the traceback in this report).
import os
import evaluate
from datasets import load_dataset
# Set before the metric is loaded; presumably meant to disable tokenizer
# parallelism for this run -- TODO confirm it is unrelated to the failure.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Load the perplexity module, explicitly requesting the "metric" module type.
perplexity = evaluate.load("perplexity", module_type="metric")
print('Loading dataset ..')
dataset = load_dataset("JeanKaddour/minipile", "default")
# Only the first 10 texts of the test split are scored.
input_texts = dataset["test"]["text"][:10]
print('Computing metrics ..')
# No `max_length` is passed here; this is the call that fails in the traceback.
results = perplexity.compute(model_id='gpt2',
add_start_token=False,
predictions=input_texts)
# Not reached while the error occurs: print the mean perplexity and the first
# per-text perplexity, each rounded to 0 decimal places.
print(round(results["mean_perplexity"], 0))
print(round(results["perplexities"][0], 0))
Here is the full trace:
Traceback (most recent call last):
File "./snippets/bug.py", line 18, in <module>
results = perplexity.compute(model_id='gpt2',
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/evaluate/module.py", line 462, in compute
output = self._compute(**inputs, **compute_kwargs)
File "/Users/pasquale/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--perplexity/8ab643ad86f568b7d1d5f7822373fa7401ff5ff0297ccf114b0ca6a33be96bc0/perplexity.py", line 179, in _compute
out_logits = model(encoded_batch, attention_mask=attn_mask).logits
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1074, in forward
transformer_outputs = self.transformer(
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 838, in forward
position_embeds = self.wpe(position_ids)
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/modules/sparse.py", line 162, in forward
return F.embedding(
File "/Users/pasquale/anaconda3/lib/python3.8/site-packages/torch/nn/functional.py", line 2233, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
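Setting `max_length` fixes this. (The traceback fails in the position-embedding lookup, `self.wpe(position_ids)`, which suggests the untruncated inputs are longer than the model supports.)
Shall we set `max_length` to `getattr(tokenizer, "model_max_length", getattr(tokenizer, "max_len", None))` automatically if it's `None` or `"auto"`?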