
How to use `SubTask` with metrics that require valid `config_name`


Issue

Currently there does not seem to be a way to define the `config_name` for a metric used by a `SubTask` inside an `evaluate.EvaluationSuite`.

Version

  • evaluate version: 0.4.0
  • transformers version: 4.32.0
  • Python version: 3.10.6

Example

For example, consider the following `EvaluationSuite`, which tries to run the "glue" metric; that metric requires a `config_name` when calling `evaluate.load`:

Code in suite.py:

import evaluate
from evaluate.evaluation_suite import SubTask
class Suite(evaluate.EvaluationSuite):

    def __init__(self, name):
        super().__init__(name)
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:10]",
                args_for_task={
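                    # NOTE: the "glue" metric needs a config_name (e.g. "sst2"),
                    # but there is no field here to pass one down to evaluate.load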
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0
                    }
                }
            ),
        ]

Now consider running this EvaluationSuite with the following:

from evaluate import EvaluationSuite
suite = EvaluationSuite.load('suite.py')
results = suite.run("gpt2")

Running this code results in the following error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[60], line 2
      1 suite = EvaluationSuite.load('suite.py')
----> 2 results = suite.run("gpt2")

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluation_suite/__init__.py:124, in EvaluationSuite.run(self, model_or_pipeline)
    122 args_for_task["subset"] = task.subset
    123 args_for_task["split"] = task.split
--> 124 results = task_evaluator.compute(**args_for_task)
    126 results["task_name"] = task_name + "/" + task.subset if task.subset else task_name
    127 results["data_preprocessor"] = str(task.data_preprocessor) if task.data_preprocessor is not None else None

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/text_classification.py:136, in TextClassificationEvaluator.compute(self, model_or_pipeline, data, subset, split, metric, tokenizer, feature_extractor, strategy, confidence_level, n_resamples, device, random_state, input_column, second_input_column, label_column, label_mapping)
    127 metric_inputs, pipe_inputs = self.prepare_data(
    128     data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column
    129 )
    130 pipe = self.prepare_pipeline(
    131     model_or_pipeline=model_or_pipeline,
    132     tokenizer=tokenizer,
    133     feature_extractor=feature_extractor,
    134     device=device,
    135 )
--> 136 metric = self.prepare_metric(metric)
    138 # Compute predictions
    139 predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/evaluator/base.py:447, in Evaluator.prepare_metric(self, metric)
    445     metric = load(self.default_metric_name)
    446 elif isinstance(metric, str):
--> 447     metric = load(metric)
    449 return metric

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/loading.py:735, in load(path, config_name, module_type, process_id, num_process, cache_dir, experiment_id, keep_in_memory, download_config, download_mode, revision, **init_kwargs)
    731 evaluation_module = evaluation_module_factory(
    732     path, module_type=module_type, revision=revision, download_config=download_config, download_mode=download_mode
    733 )
    734 evaluation_cls = import_main_class(evaluation_module.module_path)
--> 735 evaluation_instance = evaluation_cls(
    736     config_name=config_name,
    737     process_id=process_id,
    738     num_process=num_process,
    739     cache_dir=cache_dir,
    740     keep_in_memory=keep_in_memory,
    741     experiment_id=experiment_id,
    742     hash=evaluation_module.hash,
    743     **init_kwargs,
    744 )
    746 if module_type and module_type != evaluation_instance.module_type:
    747     raise TypeError(
    748         f"No module of module type '{module_type}' not found for '{path}' locally, or on the Hugging Face Hub. Found module of module type '{evaluation_instance.module_type}' instead."
    749     )

File /localdisk/twilbers/src/notebooks/poc/glue/.venv/lib/python3.10/site-packages/evaluate/module.py:182, in EvaluationModule.__init__(self, config_name, keep_in_memory, cache_dir, num_process, process_id, seed, experiment_id, hash, max_concurrent_cache_files, timeout, **kwargs)
    166 def __init__(
    167     self,
    168     config_name: Optional[str] = None,
   (...)
    179 ):
    180     # prepare info
    181     self.config_name = config_name or "default"
--> 182     info = self._info()
    183     info.module_name = camelcase_to_snakecase(self.__class__.__name__)
    184     info.config_name = self.config_name

File ~/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed/glue.py:122, in Glue._info(self)
    107 def _info(self):
    108     if self.config_name not in [
    109         "sst2",
    110         "mnli",
   (...)
    120         "hans",
    121     ]:
--> 122         raise KeyError(
    123             "You should supply a configuration name selected in "
    124             '["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
    125             '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
    126         )
    127     return evaluate.MetricInfo(
    128         description=_DESCRIPTION,
    129         citation=_CITATION,
   (...)
    139         format="numpy",
    140     )

KeyError: 'You should supply a configuration name selected in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
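
For reference, the same requirement can be reproduced outside the suite by loading the metric directly: loading "glue" without a configuration raises the KeyError above, while supplying the subset as the `config_name` succeeds. A minimal check:

import evaluate

# evaluate.load("glue")                   # raises the KeyError shown above
metric = evaluate.load("glue", "sst2")    # config_name is the second positional argument
print(metric.compute(predictions=[0, 1], references=[0, 1]))  # e.g. {'accuracy': 1.0}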

Question

  • Would there be any issue with adding a `config_name` parameter to the `SubTask` class (a purely illustrative sketch is included below)? This parameter would need to be passed down so it can be used when the metric is eventually loaded.

  • Alternatively, is the best-known method (BKM) in situations like these to pass the relevant inputs to the metric's `compute` function rather than relying on `config_name`? A possible workaround is sketched below.
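
For the first option, a purely hypothetical sketch of what the API could look like; neither the `config_name` entry nor its plumbing exists in evaluate 0.4.0:

# Hypothetical API only -- this parameter does not exist in evaluate 0.4.0.
SubTask(
    task_type="text-classification",
    data="glue",
    subset="sst2",
    split="validation[:10]",
    args_for_task={
        "metric": "glue",
        "config_name": "sst2",  # hypothetical: forwarded to evaluate.load(metric, config_name)
        # ...
    },
)

Separately, a possible workaround with the current 0.4.0 behavior: `Evaluator.prepare_metric` (visible in the traceback above) only calls `evaluate.load` when the metric is a string, so passing an already-loaded `EvaluationModule` in `args_for_task` should side-step the missing `config_name`. This is a sketch, not a tested fix:

import evaluate
from evaluate.evaluation_suite import SubTask

# Workaround sketch: load the metric up front with its config_name and hand
# the EvaluationModule object to the SubTask instead of the string "glue".
glue_sst2 = evaluate.load("glue", config_name="sst2")

SubTask(
    task_type="text-classification",
    data="glue",
    subset="sst2",
    split="validation[:10]",
    args_for_task={
        "metric": glue_sst2,  # an object, so prepare_metric() uses it as-is
        "input_column": "sentence",
        "label_column": "label",
        "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
    },
)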

tybrs · Aug 22, 2023, 23:08