Trainer failing during _save_checkpoint "cannot pickle '_thread.lock' object" with skip_memory_metrics=True
System Info
- `transformers` version: 4.28.1
- Platform: Linux-5.19.0-40-generic-x86_64-with-glibc2.35
- Python version: 3.9.13
- Huggingface_hub version: 0.13.4
- Safetensors version: not installed
- PyTorch version (GPU?): 2.0.0+cu117 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: True
- Using distributed or parallel set-up in script?: False
Who can help?
@sgugger
Information
- [ ] The official example scripts
- [X] My own modified scripts
Tasks
- [ ] An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- [X] My own task or dataset (give details below)
Reproduction
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(classes)).to('cuda')
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    optim="adamw_torch",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    no_cuda=False,
    skip_memory_metrics=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
Produces the following error:
TypeError Traceback (most recent call last)
/tmp/ipykernel_54606/4032920361.py in <module>
----> 1 trainer.train()
~/anaconda3/lib/python3.9/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1660 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1661 )
-> 1662 return inner_training_loop(
1663 args=args,
1664 resume_from_checkpoint=resume_from_checkpoint,
~/anaconda3/lib/python3.9/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2019
2020 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
-> 2021 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
2022
2023 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
~/anaconda3/lib/python3.9/site-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
2289
2290 if self.control.should_save:
-> 2291 self._save_checkpoint(model, trial, metrics=metrics)
2292 self.control = self.callback_handler.on_save(self.args, self.state, self.control)
2293
~/anaconda3/lib/python3.9/site-packages/transformers/trainer.py in _save_checkpoint(self, model, trial, metrics)
2405 # Save the Trainer state
2406 if self.args.should_save:
-> 2407 self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
2408
2409 # Save RNG state in non-distributed training
~/anaconda3/lib/python3.9/site-packages/transformers/trainer_callback.py in save_to_json(self, json_path)
95 def save_to_json(self, json_path: str):
96 """Save the content of this instance in JSON format inside `json_path`."""
---> 97 json_string = json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
98 with open(json_path, "w", encoding="utf-8") as f:
99 f.write(json_string)
~/anaconda3/lib/python3.9/dataclasses.py in asdict(obj, dict_factory)
1073 if not _is_dataclass_instance(obj):
1074 raise TypeError("asdict() should be called on dataclass instances")
-> 1075 return _asdict_inner(obj, dict_factory)
1076
1077
~/anaconda3/lib/python3.9/dataclasses.py in _asdict_inner(obj, dict_factory)
1080 result = []
1081 for f in fields(obj):
-> 1082 value = _asdict_inner(getattr(obj, f.name), dict_factory)
1083 result.append((f.name, value))
1084 return dict_factory(result)
~/anaconda3/lib/python3.9/dataclasses.py in _asdict_inner(obj, dict_factory)
1108 # generator (which is not true for namedtuples, handled
1109 # above).
-> 1110 return type(obj)(_asdict_inner(v, dict_factory) for v in obj)
1111 elif isinstance(obj, dict):
1112 return type(obj)((_asdict_inner(k, dict_factory),
~/anaconda3/lib/python3.9/dataclasses.py in <genexpr>(.0)
1108 # generator (which is not true for namedtuples, handled
1109 # above).
-> 1110 return type(obj)(_asdict_inner(v, dict_factory) for v in obj)
1111 elif isinstance(obj, dict):
1112 return type(obj)((_asdict_inner(k, dict_factory),
~/anaconda3/lib/python3.9/dataclasses.py in _asdict_inner(obj, dict_factory)
1110 return type(obj)(_asdict_inner(v, dict_factory) for v in obj)
1111 elif isinstance(obj, dict):
-> 1112 return type(obj)((_asdict_inner(k, dict_factory),
1113 _asdict_inner(v, dict_factory))
1114 for k, v in obj.items())
~/anaconda3/lib/python3.9/dataclasses.py in <genexpr>(.0)
1111 elif isinstance(obj, dict):
1112 return type(obj)((_asdict_inner(k, dict_factory),
-> 1113 _asdict_inner(v, dict_factory))
1114 for k, v in obj.items())
1115 else:
~/anaconda3/lib/python3.9/dataclasses.py in _asdict_inner(obj, dict_factory)
1114 for k, v in obj.items())
1115 else:
-> 1116 return copy.deepcopy(obj)
1117
1118
~/anaconda3/lib/python3.9/copy.py in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
173
174 # If is its own copy, don't memoize.
~/anaconda3/lib/python3.9/copy.py in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
268 if state is not None:
269 if deep:
--> 270 state = deepcopy(state, memo)
271 if hasattr(y, '__setstate__'):
272 y.__setstate__(state)
~/anaconda3/lib/python3.9/copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
~/anaconda3/lib/python3.9/copy.py in _deepcopy_dict(x, memo, deepcopy)
228 memo[id(x)] = y
229 for key, value in x.items():
--> 230 y[deepcopy(key, memo)] = deepcopy(value, memo)
231 return y
232 d[dict] = _deepcopy_dict
~/anaconda3/lib/python3.9/copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
~/anaconda3/lib/python3.9/copy.py in _deepcopy_list(x, memo, deepcopy)
203 append = y.append
204 for a in x:
--> 205 append(deepcopy(a, memo))
206 return y
207 d[list] = _deepcopy_list
~/anaconda3/lib/python3.9/copy.py in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
173
174 # If is its own copy, don't memoize.
~/anaconda3/lib/python3.9/copy.py in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
268 if state is not None:
269 if deep:
--> 270 state = deepcopy(state, memo)
271 if hasattr(y, '__setstate__'):
272 y.__setstate__(state)
~/anaconda3/lib/python3.9/copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
~/anaconda3/lib/python3.9/copy.py in _deepcopy_dict(x, memo, deepcopy)
228 memo[id(x)] = y
229 for key, value in x.items():
--> 230 y[deepcopy(key, memo)] = deepcopy(value, memo)
231 return y
232 d[dict] = _deepcopy_dict
~/anaconda3/lib/python3.9/copy.py in deepcopy(x, memo, _nil)
159 reductor = getattr(x, "__reduce_ex__", None)
160 if reductor is not None:
--> 161 rv = reductor(4)
162 else:
163 reductor = getattr(x, "__reduce__", None)
TypeError: cannot pickle '_thread.lock' object
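The bottom of the trace is the relevant part: TrainerState.save_to_json() calls dataclasses.asdict(), which falls back to copy.deepcopy() for values it doesn't recognize, and anything holding a threading lock cannot be deep-copied or pickled. A minimal sketch (illustrative only, not taken from the script above) that reproduces the same TypeError:

import dataclasses
import threading

@dataclasses.dataclass
class ToyState:
    log_history: list

# Any logged value that carries a thread lock breaks the dict conversion.
state = ToyState(log_history=[{"eval_metric": threading.Lock()}])

try:
    dataclasses.asdict(state)  # the same call TrainerState.save_to_json() makes
except TypeError as err:
    print(err)  # cannot pickle '_thread.lock' object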
Expected behavior
Training and eval proceed smoothly. It looks like the Trainer is failing while trying to save a checkpoint; I'd like training/eval to complete and to be able to load from a non-corrupt checkpoint.
I also ran this with no_cuda=True and received the same error.
Your code example doesn't define several of the objects it uses, so I can't really tell what's wrong. Please give us a minimal reproducer we can execute.
Sorry about that--I've put everything into this repo if that is easier: https://github.com/galenballew/bert-multiclass. I'll also repeat it here:
# Dependencies
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, AdamW
from tqdm import tqdm
import torch
import tools
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
train_texts, train_labels = tools.read_data("train")
val_texts, val_labels = tools.read_data("val")
test_texts, test_labels = tools.read_data("test")
train_texts = train_texts.tolist()
val_texts = val_texts.tolist()
test_texts = test_texts.tolist()
# Create integer class labels instead of strings
classes = tools.labels(train_labels).tolist()
train_labels = tools.relabel(train_labels, classes)
val_labels = tools.relabel(val_labels, classes)
test_labels = tools.relabel(test_labels, classes)
class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Supports indexing so that dataset[i] can be used to get the i-th sample.
        """
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Returns the size of the dataset.
        """
        return len(self.labels)
def compute_metrics(eval_pred):
    accuracy = load("accuracy")
    precision = load("precision")
    f1 = load("f1")
    recall = load("recall")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy.compute(predictions=predictions, references=labels)
    precision.compute(predictions=predictions, references=labels, average="micro")
    f1.compute(predictions=predictions, references=labels, average="micro")
    recall.compute(predictions=predictions, references=labels, average="micro")
    return {"accuracy": accuracy, "precision": precision, "f1": f1, "recall": recall}
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
# Turn the encodings and labels to a dataset object
train_dataset = IntentDataset(train_encodings, train_labels)
val_dataset = IntentDataset(val_encodings, val_labels)
test_dataset = IntentDataset(test_encodings, test_labels)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(classes)).to('cuda')
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    optim="adamw_torch",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    no_cuda=False,
    skip_memory_metrics=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
Could you also print `trainer.state` for us? The error comes from the fact that it is not JSON-serializable, so it would help to know which object in it is not serializable. Thanks!
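If it helps, here is a quick way to spot the non-serializable entry (a minimal sketch; `trainer` is the Trainer instance from your script):

import json

for i, entry in enumerate(trainer.state.log_history):
    for key, value in entry.items():
        try:
            json.dumps(value)
        except TypeError:
            print(f"log_history[{i}][{key!r}] is not JSON-serializable: {type(value)}")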
`trainer.state` directly after instantiation:
TrainerState(epoch=None, global_step=0, max_steps=0, num_train_epochs=0, total_flos=0, log_history=[], best_metric=None, best_model_checkpoint=None, is_local_process_zero=True, is_world_process_zero=True, is_hyper_param_search=False, trial_name=None, trial_params=None)
Added this and am including the entire output, not just the state. Either the behavior changed, or adding the try/except is causing slightly different output:
try:
    trainer.train()
except:
    print("\n\n")
    print("********************")
    print("\n\n")
    print(trainer.state)
    print("\n\n")
    print("********************")
    print("\n\n")
Trainer is attempting to log a value of "EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
predictions (`list` of `int`): Predicted labels.
references (`list` of `int`): Ground truth labels.
normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
sample_weight (`list` of `float`): Sample weights Defaults to None.
Returns:
accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.
Examples:
Example 1-A simple example
>>> accuracy_metric = evaluate.load("accuracy")
>>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
>>> print(results)
{'accuracy': 0.5}
Example 2-The same as Example 1, except with `normalize` set to `False`.
>>> accuracy_metric = evaluate.load("accuracy")
>>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
>>> print(results)
{'accuracy': 3.0}
Example 3-The same as Example 1, except with `sample_weight` set.
>>> accuracy_metric = evaluate.load("accuracy")
>>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
>>> print(results)
{'accuracy': 0.8778625954198473}
""", stored examples: 0)" of type <class 'evaluate_modules.metrics.evaluate-metric--accuracy.f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14.accuracy.Accuracy'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "EvaluationModule(name: "precision", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
predictions (`list` of `int`): Predicted class labels.
references (`list` of `int`): Actual class labels.
labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`. If `average` is `None`, it should be the label order. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
average (`string`): This parameter is required for multiclass/multilabel targets. If set to `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Defaults to `'binary'`.
- 'binary': Only report results for the class specified by `pos_label`. This is applicable only if the classes found in `predictions` and `references` are binary.
- 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.
- 'macro': Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
- 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. This option can result in an F-score that is not between precision and recall.
- 'samples': Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).
sample_weight (`list` of `float`): Sample weights Defaults to None.
zero_division (`int` or `string`): Sets the value to return when there is a zero division. Defaults to 'warn'.
- 0: Returns 0 when there is a zero division.
- 1: Returns 1 when there is a zero division.
- 'warn': Raises warnings and then returns 0 when there is a zero division.
Returns:
precision (`float` or `array` of `float`): Precision score or list of precision scores, depending on the value passed to `average`. Minimum possible value is 0. Maximum possible value is 1. Higher values indicate that fewer negative examples were incorrectly labeled as positive, which means that, generally, higher scores are better.
Examples:
Example 1-A simple binary example
>>> precision_metric = evaluate.load("precision")
>>> results = precision_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0])
>>> print(results)
{'precision': 0.5}
Example 2-The same simple binary example as in Example 1, but with `pos_label` set to `0`.
>>> precision_metric = evaluate.load("precision")
>>> results = precision_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], pos_label=0)
>>> print(round(results['precision'], 2))
0.67
Example 3-The same simple binary example as in Example 1, but with `sample_weight` included.
>>> precision_metric = evaluate.load("precision")
>>> results = precision_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], sample_weight=[0.9, 0.5, 3.9, 1.2, 0.3])
>>> print(results)
{'precision': 0.23529411764705882}
Example 4-A multiclass example, with different values for the `average` input.
>>> predictions = [0, 2, 1, 0, 0, 1]
>>> references = [0, 1, 2, 0, 1, 2]
>>> results = precision_metric.compute(predictions=predictions, references=references, average='macro')
>>> print(results)
{'precision': 0.2222222222222222}
>>> results = precision_metric.compute(predictions=predictions, references=references, average='micro')
>>> print(results)
{'precision': 0.3333333333333333}
>>> results = precision_metric.compute(predictions=predictions, references=references, average='weighted')
>>> print(results)
{'precision': 0.2222222222222222}
>>> results = precision_metric.compute(predictions=predictions, references=references, average=None)
>>> print([round(res, 2) for res in results['precision']])
[0.67, 0.0, 0.0]
""", stored examples: 0)" of type <class 'evaluate_modules.metrics.evaluate-metric--precision.4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f.precision.Precision'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "EvaluationModule(name: "f1", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
predictions (`list` of `int`): Predicted labels.
references (`list` of `int`): Ground truth labels.
labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`, and the order of the labels if `average` is `None`. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
average (`string`): This parameter is required for multiclass/multilabel targets. If set to `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Defaults to `'binary'`.
- 'binary': Only report results for the class specified by `pos_label`. This is applicable only if the classes found in `predictions` and `references` are binary.
- 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.
- 'macro': Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
- 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. This option can result in an F-score that is not between precision and recall.
- 'samples': Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).
sample_weight (`list` of `float`): Sample weights Defaults to None.
Returns:
f1 (`float` or `array` of `float`): F1 score or list of f1 scores, depending on the value passed to `average`. Minimum possible value is 0. Maximum possible value is 1. Higher f1 scores are better.
Examples:
Example 1-A simple binary example
>>> f1_metric = evaluate.load("f1")
>>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0])
>>> print(results)
{'f1': 0.5}
Example 2-The same simple binary example as in Example 1, but with `pos_label` set to `0`.
>>> f1_metric = evaluate.load("f1")
>>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], pos_label=0)
>>> print(round(results['f1'], 2))
0.67
Example 3-The same simple binary example as in Example 1, but with `sample_weight` included.
>>> f1_metric = evaluate.load("f1")
>>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], sample_weight=[0.9, 0.5, 3.9, 1.2, 0.3])
>>> print(round(results['f1'], 2))
0.35
Example 4-A multiclass example, with different values for the `average` input.
>>> predictions = [0, 2, 1, 0, 0, 1]
>>> references = [0, 1, 2, 0, 1, 2]
>>> results = f1_metric.compute(predictions=predictions, references=references, average="macro")
>>> print(round(results['f1'], 2))
0.27
>>> results = f1_metric.compute(predictions=predictions, references=references, average="micro")
>>> print(round(results['f1'], 2))
0.33
>>> results = f1_metric.compute(predictions=predictions, references=references, average="weighted")
>>> print(round(results['f1'], 2))
0.27
>>> results = f1_metric.compute(predictions=predictions, references=references, average=None)
>>> print(results)
{'f1': array([0.8, 0. , 0. ])}
Example 5-A multi-label example
>>> f1_metric = evaluate.load("f1", "multilabel")
>>> results = f1_metric.compute(predictions=[[0, 1, 1], [1, 1, 0]], references=[[0, 1, 1], [0, 1, 0]], average="macro")
>>> print(round(results['f1'], 2))
0.67
""", stored examples: 0)" of type <class 'evaluate_modules.metrics.evaluate-metric--f1.0ca73f6cf92ef5a268320c697f7b940d1030f8471714bffdb6856c641b818974.f1.F1'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "EvaluationModule(name: "recall", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
- **predictions** (`list` of `int`): The predicted labels.
- **references** (`list` of `int`): The ground truth labels.
- **labels** (`list` of `int`): The set of labels to include when `average` is not set to `binary`, and their order when average is `None`. Labels present in the data can be excluded in this input, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in y_true and y_pred are used in sorted order. Defaults to None.
- **pos_label** (`int`): The class label to use as the 'positive class' when calculating the recall. Defaults to `1`.
- **average** (`string`): This parameter is required for multiclass/multilabel targets. If None, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Defaults to `'binary'`.
- `'binary'`: Only report results for the class specified by `pos_label`. This is applicable only if the target labels and predictions are binary.
- `'micro'`: Calculate metrics globally by counting the total true positives, false negatives, and false positives.
- `'macro'`: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
- `'weighted'`: Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. Note that it can result in an F-score that is not between precision and recall.
- `'samples'`: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).
- **sample_weight** (`list` of `float`): Sample weights Defaults to `None`.
- **zero_division** (): Sets the value to return when there is a zero division. Defaults to .
- `'warn'`: If there is a zero division, the return value is `0`, but warnings are also raised.
- `0`: If there is a zero division, the return value is `0`.
- `1`: If there is a zero division, the return value is `1`.
Returns:
- **recall** (`float`, or `array` of `float`): Either the general recall score, or the recall scores for individual classes, depending on the values input to `labels` and `average`. Minimum possible value is 0. Maximum possible value is 1. A higher recall means that more of the positive examples have been labeled correctly. Therefore, a higher recall is generally considered better.
Examples:
Example 1-A simple example with some errors
>>> recall_metric = evaluate.load('recall')
>>> results = recall_metric.compute(references=[0, 0, 1, 1, 1], predictions=[0, 1, 0, 1, 1])
>>> print(results)
{'recall': 0.6666666666666666}
Example 2-The same example as Example 1, but with `pos_label=0` instead of the default `pos_label=1`.
>>> recall_metric = evaluate.load('recall')
>>> results = recall_metric.compute(references=[0, 0, 1, 1, 1], predictions=[0, 1, 0, 1, 1], pos_label=0)
>>> print(results)
{'recall': 0.5}
Example 3-The same example as Example 1, but with `sample_weight` included.
>>> recall_metric = evaluate.load('recall')
>>> sample_weight = [0.9, 0.2, 0.9, 0.3, 0.8]
>>> results = recall_metric.compute(references=[0, 0, 1, 1, 1], predictions=[0, 1, 0, 1, 1], sample_weight=sample_weight)
>>> print(results)
{'recall': 0.55}
Example 4-A multiclass example, using different averages.
>>> recall_metric = evaluate.load('recall')
>>> predictions = [0, 2, 1, 0, 0, 1]
>>> references = [0, 1, 2, 0, 1, 2]
>>> results = recall_metric.compute(predictions=predictions, references=references, average='macro')
>>> print(results)
{'recall': 0.3333333333333333}
>>> results = recall_metric.compute(predictions=predictions, references=references, average='micro')
>>> print(results)
{'recall': 0.3333333333333333}
>>> results = recall_metric.compute(predictions=predictions, references=references, average='weighted')
>>> print(results)
{'recall': 0.3333333333333333}
>>> results = recall_metric.compute(predictions=predictions, references=references, average=None)
>>> print(results)
{'recall': array([1., 0., 0.])}
""", stored examples: 0)" of type <class 'evaluate_modules.metrics.evaluate-metric--recall.e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185.recall.Recall'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Exception ignored in: <function BaseFileLock.__del__ at 0x7fb2db3b1160>
Traceback (most recent call last):
File "/home/master/anaconda3/lib/python3.9/site-packages/datasets/utils/filelock.py", line 328, in __del__
self.release(force=True)
File "/home/master/anaconda3/lib/python3.9/site-packages/datasets/utils/filelock.py", line 304, in release
with self._thread_lock:
AttributeError: 'UnixFileLock' object has no attribute '_thread_lock'
********************
TrainerState(epoch=1.0, global_step=944, max_steps=1888, num_train_epochs=2, total_flos=256413353347800.0, log_history=[{'loss': 0.084, 'learning_rate': 1.4703389830508477e-05, 'epoch': 0.53, 'step': 500}, {'eval_loss': 0.2768215239048004, 'eval_accuracy': EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
predictions (`list` of `int`): Predicted labels.
references (`list` of `int`): Ground truth labels.
normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
sample_weight (`list` of `float`): Sample weights Defaults to None.
Returns:
accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.
Examples:
Example 1-A simple example
>>> accuracy_metric = evaluate.load("accuracy")
>>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
>>> print(results)
{'accuracy': 0.5}
Example 2-The same as Example 1, except with `normalize` set to `False`.
>>> accuracy_metric = evaluate.load("accuracy")
>>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
>>> print(results)
{'accuracy': 3.0}
Example 3-The same as Example 1, except with `sample_weight` set.
>>> accuracy_metric = evaluate.load("accuracy")
>>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
>>> print(results)
{'accuracy': 0.8778625954198473}
""", stored examples: 0), 'eval_precision': EvaluationModule(name: "precision", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
predictions (`list` of `int`): Predicted class labels.
references (`list` of `int`): Actual class labels.
labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`. If `average` is `None`, it should be the label order. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
average (`string`): This parameter is required for multiclass/multilabel targets. If set to `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Defaults to `'binary'`.
- 'binary': Only report results for the class specified by `pos_label`. This is applicable only if the classes found in `predictions` and `references` are binary.
- 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.
- 'macro': Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
- 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. This option can result in an F-score that is not between precision and recall.
- 'samples': Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).
sample_weight (`list` of `float`): Sample weights Defaults to None.
zero_division (`int` or `string`): Sets the value to return when there is a zero division. Defaults to 'warn'.
- 0: Returns 0 when there is a zero division.
- 1: Returns 1 when there is a zero division.
- 'warn': Raises warnings and then returns 0 when there is a zero division.
Returns:
precision (`float` or `array` of `float`): Precision score or list of precision scores, depending on the value passed to `average`. Minimum possible value is 0. Maximum possible value is 1. Higher values indicate that fewer negative examples were incorrectly labeled as positive, which means that, generally, higher scores are better.
Examples:
Example 1-A simple binary example
>>> precision_metric = evaluate.load("precision")
>>> results = precision_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0])
>>> print(results)
{'precision': 0.5}
Example 2-The same simple binary example as in Example 1, but with `pos_label` set to `0`.
>>> precision_metric = evaluate.load("precision")
>>> results = precision_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], pos_label=0)
>>> print(round(results['precision'], 2))
0.67
Example 3-The same simple binary example as in Example 1, but with `sample_weight` included.
>>> precision_metric = evaluate.load("precision")
>>> results = precision_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], sample_weight=[0.9, 0.5, 3.9, 1.2, 0.3])
>>> print(results)
{'precision': 0.23529411764705882}
Example 4-A multiclass example, with different values for the `average` input.
>>> predictions = [0, 2, 1, 0, 0, 1]
>>> references = [0, 1, 2, 0, 1, 2]
>>> results = precision_metric.compute(predictions=predictions, references=references, average='macro')
>>> print(results)
{'precision': 0.2222222222222222}
>>> results = precision_metric.compute(predictions=predictions, references=references, average='micro')
>>> print(results)
{'precision': 0.3333333333333333}
>>> results = precision_metric.compute(predictions=predictions, references=references, average='weighted')
>>> print(results)
{'precision': 0.2222222222222222}
>>> results = precision_metric.compute(predictions=predictions, references=references, average=None)
>>> print([round(res, 2) for res in results['precision']])
[0.67, 0.0, 0.0]
""", stored examples: 0), 'eval_f1': EvaluationModule(name: "f1", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
predictions (`list` of `int`): Predicted labels.
references (`list` of `int`): Ground truth labels.
labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`, and the order of the labels if `average` is `None`. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
average (`string`): This parameter is required for multiclass/multilabel targets. If set to `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Defaults to `'binary'`.
- 'binary': Only report results for the class specified by `pos_label`. This is applicable only if the classes found in `predictions` and `references` are binary.
- 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.
- 'macro': Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
- 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. This option can result in an F-score that is not between precision and recall.
- 'samples': Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).
sample_weight (`list` of `float`): Sample weights Defaults to None.
Returns:
f1 (`float` or `array` of `float`): F1 score or list of f1 scores, depending on the value passed to `average`. Minimum possible value is 0. Maximum possible value is 1. Higher f1 scores are better.
Examples:
Example 1-A simple binary example
>>> f1_metric = evaluate.load("f1")
>>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0])
>>> print(results)
{'f1': 0.5}
Example 2-The same simple binary example as in Example 1, but with `pos_label` set to `0`.
>>> f1_metric = evaluate.load("f1")
>>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], pos_label=0)
>>> print(round(results['f1'], 2))
0.67
Example 3-The same simple binary example as in Example 1, but with `sample_weight` included.
>>> f1_metric = evaluate.load("f1")
>>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], sample_weight=[0.9, 0.5, 3.9, 1.2, 0.3])
>>> print(round(results['f1'], 2))
0.35
Example 4-A multiclass example, with different values for the `average` input.
>>> predictions = [0, 2, 1, 0, 0, 1]
>>> references = [0, 1, 2, 0, 1, 2]
>>> results = f1_metric.compute(predictions=predictions, references=references, average="macro")
>>> print(round(results['f1'], 2))
0.27
>>> results = f1_metric.compute(predictions=predictions, references=references, average="micro")
>>> print(round(results['f1'], 2))
0.33
>>> results = f1_metric.compute(predictions=predictions, references=references, average="weighted")
>>> print(round(results['f1'], 2))
0.27
>>> results = f1_metric.compute(predictions=predictions, references=references, average=None)
>>> print(results)
{'f1': array([0.8, 0. , 0. ])}
Example 5-A multi-label example
>>> f1_metric = evaluate.load("f1", "multilabel")
>>> results = f1_metric.compute(predictions=[[0, 1, 1], [1, 1, 0]], references=[[0, 1, 1], [0, 1, 0]], average="macro")
>>> print(round(results['f1'], 2))
0.67
""", stored examples: 0), 'eval_recall': EvaluationModule(name: "recall", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
- **predictions** (`list` of `int`): The predicted labels.
- **references** (`list` of `int`): The ground truth labels.
- **labels** (`list` of `int`): The set of labels to include when `average` is not set to `binary`, and their order when average is `None`. Labels present in the data can be excluded in this input, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in y_true and y_pred are used in sorted order. Defaults to None.
- **pos_label** (`int`): The class label to use as the 'positive class' when calculating the recall. Defaults to `1`.
- **average** (`string`): This parameter is required for multiclass/multilabel targets. If None, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Defaults to `'binary'`.
- `'binary'`: Only report results for the class specified by `pos_label`. This is applicable only if the target labels and predictions are binary.
- `'micro'`: Calculate metrics globally by counting the total true positives, false negatives, and false positives.
- `'macro'`: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
- `'weighted'`: Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. Note that it can result in an F-score that is not between precision and recall.
- `'samples'`: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).
- **sample_weight** (`list` of `float`): Sample weights Defaults to `None`.
- **zero_division** (): Sets the value to return when there is a zero division. Defaults to .
- `'warn'`: If there is a zero division, the return value is `0`, but warnings are also raised.
- `0`: If there is a zero division, the return value is `0`.
- `1`: If there is a zero division, the return value is `1`.
Returns:
- **recall** (`float`, or `array` of `float`): Either the general recall score, or the recall scores for individual classes, depending on the values input to `labels` and `average`. Minimum possible value is 0. Maximum possible value is 1. A higher recall means that more of the positive examples have been labeled correctly. Therefore, a higher recall is generally considered better.
Examples:
Example 1-A simple example with some errors
>>> recall_metric = evaluate.load('recall')
>>> results = recall_metric.compute(references=[0, 0, 1, 1, 1], predictions=[0, 1, 0, 1, 1])
>>> print(results)
{'recall': 0.6666666666666666}
Example 2-The same example as Example 1, but with `pos_label=0` instead of the default `pos_label=1`.
>>> recall_metric = evaluate.load('recall')
>>> results = recall_metric.compute(references=[0, 0, 1, 1, 1], predictions=[0, 1, 0, 1, 1], pos_label=0)
>>> print(results)
{'recall': 0.5}
Example 3-The same example as Example 1, but with `sample_weight` included.
>>> recall_metric = evaluate.load('recall')
>>> sample_weight = [0.9, 0.2, 0.9, 0.3, 0.8]
>>> results = recall_metric.compute(references=[0, 0, 1, 1, 1], predictions=[0, 1, 0, 1, 1], sample_weight=sample_weight)
>>> print(results)
{'recall': 0.55}
Example 4-A multiclass example, using different averages.
>>> recall_metric = evaluate.load('recall')
>>> predictions = [0, 2, 1, 0, 0, 1]
>>> references = [0, 1, 2, 0, 1, 2]
>>> results = recall_metric.compute(predictions=predictions, references=references, average='macro')
>>> print(results)
{'recall': 0.3333333333333333}
>>> results = recall_metric.compute(predictions=predictions, references=references, average='micro')
>>> print(results)
{'recall': 0.3333333333333333}
>>> results = recall_metric.compute(predictions=predictions, references=references, average='weighted')
>>> print(results)
{'recall': 0.3333333333333333}
>>> results = recall_metric.compute(predictions=predictions, references=references, average=None)
>>> print(results)
{'recall': array([1., 0., 0.])}
""", stored examples: 0), 'eval_runtime': 4.3362, 'eval_samples_per_second': 714.904, 'eval_steps_per_second': 44.739, 'epoch': 1.0, 'step': 944}], best_metric=0.2768215239048004, best_model_checkpoint='./results/checkpoint-944', is_local_process_zero=True, is_world_process_zero=True, is_hyper_param_search=False, trial_name=None, trial_params=None)
********************
So your metrics are not floats: each one ends up being a whole `evaluate` metric module, which is why you have the issue. The code you pasted is actually quite odd:
def compute_metrics(eval_pred):
    accuracy = load("accuracy")
    precision = load("precision")
    f1 = load("f1")
    recall = load("recall")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy.compute(predictions=predictions, references=labels)
    precision.compute(predictions=predictions, references=labels, average="micro")
    f1.compute(predictions=predictions, references=labels, average="micro")
    recall.compute(predictions=predictions, references=labels, average="micro")
    return {"accuracy": accuracy, "precision": precision, "f1": f1, "recall": recall}
You compute the results on the predictions and labels but don't store them anywhere; instead you return the metric objects themselves (from `evaluate`, I guess?) rather than the computed values.
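For example (a minimal sketch, assuming `load` here is `evaluate.load`), compare the module itself with its computed result:

from evaluate import load

accuracy = load("accuracy")
result = accuracy.compute(predictions=[0, 1, 1], references=[0, 1, 0])

print(type(accuracy))  # an EvaluationModule subclass -- what the function above returns
print(result)          # a plain dict of floats, e.g. {'accuracy': 0.66...}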
Great catch! I modified compute_metrics() so that it now runs successfully without any warnings:
from evaluate import load
import numpy as np

def compute_metrics(eval_pred):
    accuracy = load("accuracy")
    precision = load("precision")
    f1 = load("f1")
    recall = load("recall")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy_ = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    precision_ = precision.compute(predictions=predictions, references=labels, average="micro")["precision"]
    f1_ = f1.compute(predictions=predictions, references=labels, average="micro")["f1"]
    recall_ = recall.compute(predictions=predictions, references=labels, average="micro")["recall"]
    return {"accuracy": accuracy_, "precision": precision_, "f1": f1_, "recall": recall_}
However, the results don't quite seem to make sense. That said, the original issue is definitely resolved. I really appreciate your help--thank you!
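For what it's worth, the numbers may look odd because, for single-label multiclass data, micro-averaged precision, recall, and F1 all reduce to plain accuracy, so all four reported metrics come out identical. A quick sketch with scikit-learn (illustrative values only):

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 2, 2, 2, 1, 1]

# All four print the same value (4 correct out of 6, i.e. ~0.667).
print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred, average="micro"))
print(recall_score(y_true, y_pred, average="micro"))
print(f1_score(y_true, y_pred, average="micro"))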