Add support for the GLUE metric in `EvaluationSuite`
Currently, there is no way to run the "evaluate-metric/glue" metric with an `EvaluationSuite`. This PR proposes one implementation to achieve this.
Example of Current Issue
Assume the following `EvaluationSuite` defined in `glue_suite.py`:
```python
import evaluate
from evaluate.evaluation_suite import SubTask


class Suite(evaluate.EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:10]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0,
                    },
                },
            ),
        ]
```
If we load and run this suite with the following code:

```python
from evaluate import EvaluationSuite

suite = EvaluationSuite.load('glue_suite.py')
results = suite.run("gpt2")
```

we get the following error:
```
-------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[60], line 2
1 suite = EvaluationSuite.load('glue_suite.py')
----> 2 results = suite.run("gpt2")
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluation_suite/__init__.py:124, in EvaluationSuite.run(self, model_or_pipeline)
122 args_for_task["subset"] = task.subset
123 args_for_task["split"] = task.split
--> 124 results = task_evaluator.compute(**args_for_task)
126 results["task_name"] = task_name + "/" + task.subset if task.subset else task_name
127 results["data_preprocessor"] = str(task.data_preprocessor) if task.data_preprocessor is not None else None
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/text_classification.py:136, in TextClassificationEvaluator.compute(self, model_or_pipeline, data, subset, split, metric, tokenizer, feature_extractor, strategy, confidence_level, n_resamples, device, random_state, input_column, second_input_column, label_column, label_mapping)
127 metric_inputs, pipe_inputs = self.prepare_data(
128 data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column
129 )
130 pipe = self.prepare_pipeline(
131 model_or_pipeline=model_or_pipeline,
132 tokenizer=tokenizer,
133 feature_extractor=feature_extractor,
134 device=device,
135 )
--> 136 metric = self.prepare_metric(metric)
138 # Compute predictions
139 predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/base.py:447, in Evaluator.prepare_metric(self, metric)
445 metric = load(self.default_metric_name)
446 elif isinstance(metric, str):
--> 447 metric = load(metric)
449 return metric
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/loading.py:735, in load(path, config_name, module_type, process_id, num_process, cache_dir, experiment_id, keep_in_memory, download_config, download_mode, revision, **init_kwargs)
731 evaluation_module = evaluation_module_factory(
732 path, module_type=module_type, revision=revision, download_config=download_config, download_mode=download_mode
733 )
734 evaluation_cls = import_main_class(evaluation_module.module_path)
--> 735 evaluation_instance = evaluation_cls(
736 config_name=config_name,
737 process_id=process_id,
738 num_process=num_process,
739 cache_dir=cache_dir,
740 keep_in_memory=keep_in_memory,
741 experiment_id=experiment_id,
742 hash=evaluation_module.hash,
743 **init_kwargs,
744 )
746 if module_type and module_type != evaluation_instance.module_type:
747 raise TypeError(
748 f"No module of module type '{module_type}' not found for '{path}' locally, or on the Hugging Face Hub. Found module of module type '{evaluation_instance.module_type}' instead."
749 )
File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/module.py:182, in EvaluationModule.__init__(self, config_name, keep_in_memory, cache_dir, num_process, process_id, seed, experiment_id, hash, max_concurrent_cache_files, timeout, **kwargs)
166 def __init__(
167 self,
168 config_name: Optional[str] = None,
(...)
179 ):
180 # prepare info
181 self.config_name = config_name or "default"
--> 182 info = self._info()
183 info.module_name = camelcase_to_snakecase(self.__class__.__name__)
184 info.config_name = self.config_name
File ~/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed/glue.py:122, in Glue._info(self)
107 def _info(self):
108 if self.config_name not in [
109 "sst2",
110 "mnli",
(...)
120 "hans",
121 ]:
--> 122 raise KeyError(
123 "You should supply a configuration name selected in "
124 '["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
125 '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
126 )
127 return evaluate.MetricInfo(
128 description=_DESCRIPTION,
129 citation=_CITATION,
(...)
139 format="numpy",
140 )
KeyError: 'You should supply a configuration name selected in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
```
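The root cause is that the GLUE metric cannot be loaded without a configuration name, while `Evaluator.prepare_metric` calls `load(metric)` with the metric name only. A quick illustration, using the config names listed in the error message:

```python
import evaluate

# Loading GLUE with a config name works:
glue_sst2 = evaluate.load("glue", "sst2")

# Loading it without one raises the KeyError shown in the traceback above:
glue = evaluate.load("glue")
```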
Solution
This proposal adds `config_name` as a supported key in the `SubTask.args_for_task` attribute. By modifying `TextClassificationEvaluator.compute` and `Evaluator.prepare_metric` to accept a `config_name` argument, we can run GLUE with the modified `glue_suite.py` below (a sketch of the evaluator-side change follows the listing):
```python
import evaluate
from evaluate.evaluation_suite import SubTask


class Suite(evaluate.EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:10]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "config_name": "sst2",
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0,
                    },
                },
            ),
        ]
```
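For context, a minimal sketch of what the evaluator-side change could look like (illustrative only, not the exact diff in this PR; the surrounding class code is elided and `default_metric_name` is a placeholder):

```python
from evaluate import load


class Evaluator:
    # Sketch only: the relevant part of Evaluator.prepare_metric, extended
    # with an optional config_name that is forwarded to evaluate.load.
    default_metric_name = "accuracy"  # placeholder for illustration

    def prepare_metric(self, metric, config_name=None):
        if metric is None:
            metric = load(self.default_metric_name)
        elif isinstance(metric, str):
            # Forward the config name so metrics such as "glue" can resolve
            # their subset, e.g. load("glue", config_name="sst2").
            metric = load(metric, config_name=config_name)
        return metric
```

`TextClassificationEvaluator.compute` would then accept a `config_name` keyword and pass it through via `self.prepare_metric(metric, config_name=config_name)`, so the `"config_name"` entry in `args_for_task` reaches the metric loader.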
Hi @lvwerra, would you please review and provide feedback, or assign this to the right owner?
Thanks.