Issue with the production of MMLU datasets

Open eliyahabba opened this issue 1 year ago • 0 comments

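Bug in the MMLU dataset production process: building the dataset with demos enabled (`num_demos=1`, `demos_pool_size=10`) fails with a `DatasetGenerationError`: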


from unitxt.templates import MultipleChoiceTemplate
# self.template = MultipleChoiceTemplate(type='multiple_choice_template', artifact_identifier='template_0', _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True, postprocessors=['processors.first_character'], instruction='', target_prefix='', title_fields=[], input_format='Question: [question] Choices: [choices] Answer: [answer]\nQuestion: {question} Choices: {choices} Answer:', choices_field='choices', target_field='answer', choices_seperator=' ', source_choice_format='{choice_numeral}. {choice_text}', target_choice_format='{choice_numeral}', enumerator='ABCDEFGHIJKLMNOP', shuffle_choices=False)
template=MultipleChoiceTemplate(
    input_format=self.template.input_format,
    target_field=self.template.target_field,
    choices_seperator=self.template.choices_seperator,
    enumerator="numerals",
    postprocessors=self.template.postprocessors,
)
recipe = StandardRecipe(
    card='cards.mmlu.anatomy',
    template=template,
    format='formats.empty',
    num_demos=1,
    demos_pool_size=10,
    max_train_instances=1,
    max_validation_instances=1,
    max_test_instances=1,
)

dataset = recipe().to_dataset()

The output is:

...
Generating train split: 1 examples [00:01, 1.83s/ examples]
Generating train split: 1 examples [00:00, 2.79 examples/s]
Generating train split: 5 examples [00:00, 8.95 examples/s]
Generating train split: 0 examples [00:00, ? examples/s]
    raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset

When I adjust the demo parameters (`num_demos=0`, `demos_pool_size=None`), the process seems to work as expected:

from unitxt.templates import MultipleChoiceTemplate
# self.template = MultipleChoiceTemplate(type='multiple_choice_template', artifact_identifier='template_0', _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True, postprocessors=['processors.first_character'], instruction='', target_prefix='', title_fields=[], input_format='Question: [question] Choices: [choices] Answer: [answer]\nQuestion: {question} Choices: {choices} Answer:', choices_field='choices', target_field='answer', choices_seperator=' ', source_choice_format='{choice_numeral}. {choice_text}', target_choice_format='{choice_numeral}', enumerator='ABCDEFGHIJKLMNOP', shuffle_choices=False)
template=MultipleChoiceTemplate(
    input_format=self.template.input_format,
    target_field=self.template.target_field,
    choices_seperator=self.template.choices_seperator,
    enumerator="numerals",
    postprocessors=self.template.postprocessors,
)
recipe = StandardRecipe(
    card='cards.mmlu.anatomy',
    template=template,
    format='formats.empty',
    num_demos=0,
    demos_pool_size=None,
    max_train_instances=1,
    max_validation_instances=1,
    max_test_instances=1,
)

dataset = recipe().to_dataset()

Mar 21 '24 17:03 eliyahabba