
How to use `SubTask` with metrics that require valid `config_name`


Issue

Currently there does not seem to be a way to define the `config_name` for a metric used by a `SubTask` inside an `evaluate.EvaluationSuite`.

Version

  • evaluate version: 0.4.0
  • transformers version: 4.32.0
  • Python version: 3.10.6

Example

For example, consider the following `EvaluationSuite`, which tries to run the "glue" metric; that metric requires a `config_name` when calling `evaluate.load`:

Code in suite.py:

import evaluate
from evaluate.evaluation_suite import SubTask
class Suite(evaluate.EvaluationSuite):

    def __init__(self, name):
        super().__init__(name)
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:10]",
                args_for_task={
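                    # NOTE: the "glue" metric needs a config_name (e.g. "sst2"),
                    # but there is no field here to pass one down to evaluate.load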
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0
                    }
                }
            ),
        ]

Now consider running this EvaluationSuite with the following:

from evaluate import EvaluationSuite
suite = EvaluationSuite.load('suite.py')
results = suite.run("gpt2")

Running this code results in the following error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[60], line 2
      1 suite = EvaluationSuite.load('suite.py')
----> 2 results = suite.run("gpt2")

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluation_suite/__init__.py:124, in EvaluationSuite.run(self, model_or_pipeline)
    122 args_for_task["subset"] = task.subset
    123 args_for_task["split"] = task.split
--> 124 results = task_evaluator.compute(**args_for_task)
    126 results["task_name"] = task_name + "/" + task.subset if task.subset else task_name
    127 results["data_preprocessor"] = str(task.data_preprocessor) if task.data_preprocessor is not None else None

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/text_classification.py:136, in TextClassificationEvaluator.compute(self, model_or_pipeline, data, subset, split, metric, tokenizer, feature_extractor, strategy, confidence_level, n_resamples, device, random_state, input_column, second_input_column, label_column, label_mapping)
    127 metric_inputs, pipe_inputs = self.prepare_data(
    128     data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column
    129 )
    130 pipe = self.prepare_pipeline(
    131     model_or_pipeline=model_or_pipeline,
    132     tokenizer=tokenizer,
    133     feature_extractor=feature_extractor,
    134     device=device,
    135 )
--> 136 metric = self.prepare_metric(metric)
    138 # Compute predictions
    139 predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/base.py:447, in Evaluator.prepare_metric(self, metric)
    445     metric = load(self.default_metric_name)
    446 elif isinstance(metric, str):
--> 447     metric = load(metric)
    449 return metric

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/loading.py:735, in load(path, config_name, module_type, process_id, num_process, cache_dir, experiment_id, keep_in_memory, download_config, download_mode, revision, **init_kwargs)
    731 evaluation_module = evaluation_module_factory(
    732     path, module_type=module_type, revision=revision, download_config=download_config, download_mode=download_mode
    733 )
    734 evaluation_cls = import_main_class(evaluation_module.module_path)
--> 735 evaluation_instance = evaluation_cls(
    736     config_name=config_name,
    737     process_id=process_id,
    738     num_process=num_process,
    739     cache_dir=cache_dir,
    740     keep_in_memory=keep_in_memory,
    741     experiment_id=experiment_id,
    742     hash=evaluation_module.hash,
    743     **init_kwargs,
    744 )
    746 if module_type and module_type != evaluation_instance.module_type:
    747     raise TypeError(
    748         f"No module of module type '{module_type}' not found for '{path}' locally, or on the Hugging Face Hub. Found module of module type '{evaluation_instance.module_type}' instead."
    749     )

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/module.py:182, in EvaluationModule.__init__(self, config_name, keep_in_memory, cache_dir, num_process, process_id, seed, experiment_id, hash, max_concurrent_cache_files, timeout, **kwargs)
    166 def __init__(
    167     self,
    168     config_name: Optional[str] = None,
   (...)
    179 ):
    180     # prepare info
    181     self.config_name = config_name or "default"
--> 182     info = self._info()
    183     info.module_name = camelcase_to_snakecase(self.__class__.__name__)
    184     info.config_name = self.config_name

File ~/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed/glue.py:122, in Glue._info(self)
    107 def _info(self):
    108     if self.config_name not in [
    109         "sst2",
    110         "mnli",
   (...)
    120         "hans",
    121     ]:
--> 122         raise KeyError(
    123             "You should supply a configuration name selected in "
    124             '["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
    125             '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
    126         )
    127     return evaluate.MetricInfo(
    128         description=_DESCRIPTION,
    129         citation=_CITATION,
   (...)
    139         format="numpy",
    140     )

KeyError: 'You should supply a configuration name selected in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
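
For reference, the same requirement can be reproduced outside the suite by loading the metric directly: loading "glue" without a configuration raises the KeyError above, while supplying the subset as the `config_name` succeeds. A minimal check:

import evaluate

# evaluate.load("glue")                   # raises the KeyError shown above
metric = evaluate.load("glue", "sst2")    # config_name is the second positional argument
print(metric.compute(predictions=[0, 1], references=[0, 1]))  # e.g. {'accuracy': 1.0}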

Question

  • Would there be any issue with adding a `config_name` parameter to the `SubTask` class (a purely illustrative sketch is included below)? This parameter would need to be passed down so it can be used when the metric is eventually loaded.

  • Alternatively, is the best-known method (BKM) in situations like these to pass the relevant inputs to the metric's `compute` function rather than relying on `config_name`? A possible workaround is sketched below.
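
For the first option, a purely hypothetical sketch of what the API could look like; neither the `config_name` entry nor its plumbing exists in evaluate 0.4.0:

# Hypothetical API only -- this parameter does not exist in evaluate 0.4.0.
SubTask(
    task_type="text-classification",
    data="glue",
    subset="sst2",
    split="validation[:10]",
    args_for_task={
        "metric": "glue",
        "config_name": "sst2",  # hypothetical: forwarded to evaluate.load(metric, config_name)
        # ...
    },
)

Separately, a possible workaround with the current 0.4.0 behavior: `Evaluator.prepare_metric` (visible in the traceback above) only calls `evaluate.load` when the metric is a string, so passing an already-loaded `EvaluationModule` in `args_for_task` should side-step the missing `config_name`. This is a sketch, not a tested fix:

import evaluate
from evaluate.evaluation_suite import SubTask

# Workaround sketch: load the metric up front with its config_name and hand
# the EvaluationModule object to the SubTask instead of the string "glue".
glue_sst2 = evaluate.load("glue", config_name="sst2")

SubTask(
    task_type="text-classification",
    data="glue",
    subset="sst2",
    split="validation[:10]",
    args_for_task={
        "metric": glue_sst2,  # an object, so prepare_metric() uses it as-is
        "input_column": "sentence",
        "label_column": "label",
        "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
    },
)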

tybrs · Aug 22, 2023, 23:08