Add support for the GLUE metric in `EvaluationSuite`
Currently, there is no way to run the "evaluate-metric/glue" metric with an `EvaluationSuite`. This PR proposes one implementation to achieve this.
Example of Current Issue
Assume the following `EvaluationSuite` defined in `glue_suite.py`:
```python
import evaluate
from evaluate.evaluation_suite import SubTask


class Suite(evaluate.EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:10]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0,
                    },
                },
            ),
        ]
```
If we load and run this suite with the following code:

```python
from evaluate import EvaluationSuite

suite = EvaluationSuite.load('glue_suite.py')
results = suite.run("gpt2")
```

we get the following error:
```
-------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[60], line 2
1 suite = EvaluationSuite.load('glue_suite.py')
----> 2 results = suite.run("gpt2")
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluation_suite/__init__.py:124, in EvaluationSuite.run(self, model_or_pipeline)
122 args_for_task["subset"] = task.subset
123 args_for_task["split"] = task.split
--> 124 results = task_evaluator.compute(**args_for_task)
126 results["task_name"] = task_name + "/" + task.subset if task.subset else task_name
127 results["data_preprocessor"] = str(task.data_preprocessor) if task.data_preprocessor is not None else None
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/text_classification.py:136, in TextClassificationEvaluator.compute(self, model_or_pipeline, data, subset, split, metric, tokenizer, feature_extractor, strategy, confidence_level, n_resamples, device, random_state, input_column, second_input_column, label_column, label_mapping)
127 metric_inputs, pipe_inputs = self.prepare_data(
128 data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column
129 )
130 pipe = self.prepare_pipeline(
131 model_or_pipeline=model_or_pipeline,
132 tokenizer=tokenizer,
133 feature_extractor=feature_extractor,
134 device=device,
135 )
--> 136 metric = self.prepare_metric(metric)
138 # Compute predictions
139 predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/base.py:447, in Evaluator.prepare_metric(self, metric)
445 metric = load(self.default_metric_name)
446 elif isinstance(metric, str):
--> 447 metric = load(metric)
449 return metric
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/loading.py:735, in load(path, config_name, module_type, process_id, num_process, cache_dir, experiment_id, keep_in_memory, download_config, download_mode, revision, **init_kwargs)
731 evaluation_module = evaluation_module_factory(
732 path, module_type=module_type, revision=revision, download_config=download_config, download_mode=download_mode
733 )
734 evaluation_cls = import_main_class(evaluation_module.module_path)
--> 735 evaluation_instance = evaluation_cls(
736 config_name=config_name,
737 process_id=process_id,
738 num_process=num_process,
739 cache_dir=cache_dir,
740 keep_in_memory=keep_in_memory,
741 experiment_id=experiment_id,
742 hash=evaluation_module.hash,
743 **init_kwargs,
744 )
746 if module_type and module_type != evaluation_instance.module_type:
747 raise TypeError(
748 f"No module of module type '{module_type}' not found for '{path}' locally, or on the Hugging Face Hub. Found module of module type '{evaluation_instance.module_type}' instead."
749 )
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/module.py:182, in EvaluationModule.__init__(self, config_name, keep_in_memory, cache_dir, num_process, process_id, seed, experiment_id, hash, max_concurrent_cache_files, timeout, **kwargs)
166 def __init__(
167 self,
168 config_name: Optional[str] = None,
(...)
179 ):
180 # prepare info
181 self.config_name = config_name or "default"
--> 182 info = self._info()
183 info.module_name = camelcase_to_snakecase(self.__class__.__name__)
184 info.config_name = self.config_name
File ~/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed/glue.py:122, in Glue._info(self)
107 def _info(self):
108 if self.config_name not in [
109 "sst2",
110 "mnli",
(...)
120 "hans",
121 ]:
--> 122 raise KeyError(
123 "You should supply a configuration name selected in "
124 '["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
125 '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
126 )
127 return evaluate.MetricInfo(
128 description=_DESCRIPTION,
129 citation=_CITATION,
(...)
139 format="numpy",
140 )
KeyError: 'You should supply a configuration name selected in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
```
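The root cause is that the GLUE metric cannot be loaded without a configuration name, while `Evaluator.prepare_metric` calls `load(metric)` with the metric name only. A quick illustration, using the config names listed in the error message:

```python
import evaluate

# Loading GLUE with a config name works:
glue_sst2 = evaluate.load("glue", "sst2")

# Loading it without one raises the KeyError shown in the traceback above:
glue = evaluate.load("glue")
```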
Solution
This proposal adds `config_name` as a supported key in the `SubTask.args_for_task` attribute. By modifying `TextClassificationEvaluator.compute` and `Evaluator.prepare_metric` to accept a `config_name` argument, we can run GLUE with the modified `glue_suite.py` below (a sketch of the evaluator-side change follows the listing):
```python
import evaluate
from evaluate.evaluation_suite import SubTask


class Suite(evaluate.EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:10]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "config_name": "sst2",
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0,
                    },
                },
            ),
        ]
```
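For context, a minimal sketch of what the evaluator-side change could look like (illustrative only, not the exact diff in this PR; the surrounding class code is elided and `default_metric_name` is a placeholder):

```python
from evaluate import load


class Evaluator:
    # Sketch only: the relevant part of Evaluator.prepare_metric, extended
    # with an optional config_name that is forwarded to evaluate.load.
    default_metric_name = "accuracy"  # placeholder for illustration

    def prepare_metric(self, metric, config_name=None):
        if metric is None:
            metric = load(self.default_metric_name)
        elif isinstance(metric, str):
            # Forward the config name so metrics such as "glue" can resolve
            # their subset, e.g. load("glue", config_name="sst2").
            metric = load(metric, config_name=config_name)
        return metric
```

`TextClassificationEvaluator.compute` would then accept a `config_name` keyword and pass it through via `self.prepare_metric(metric, config_name=config_name)`, so the `"config_name"` entry in `args_for_task` reaches the metric loader.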
Hi @lvwerra, would you please review and provide feedback, or assign this to the right owner?
Thanks.