autolabel
autolabel copied to clipboard
[Bug]: JSON config fails validation when the labels contain Chinese characters
## The JSON config does not support Chinese: when the labels are written in Chinese, as in the config below, loading it results in the error shown in section 3.
1.config
{
"task_name": "ScenicAreaInformationClassification",
"task_type": "multilabel_classification",
"dataset": {
"label_column": "labels",
"label_separator": ", ",
"delimiter": ","
},
"model": {
"provider": "openai",
"name": "gpt-3.5-turbo"
},
"prompt": {
"task_guidelines": "You are an expert at providing one or more correct content classifications based on the text.\nYour job is to accurately label the provided input example with one or more of the following categories:\n{labels}",
"output_guidelines": "You will return the answer as a comma separated list of labels sorted in alphabetical order. For example: \"label1, label2, label3\"",
"labels": [
"大山",
"森林",
"海洋"
],
"few_shot_examples": "D:\\graphRag\\graph-re\\label\\seed.csv",
"few_shot_selection": "semantic_similarity",
"few_shot_num": 5,
"example_template": "Input: {example}\nOutput: {labels}"
}
}
2.code
# -*- coding: utf-8 -*-
# Reproduction script for the reported failure: builds a LabelingAgent and an
# AutolabelDataset from the same JSON config path, then plans, runs, and saves
# a labeling job over a CSV file.
from dotenv import load_dotenv
from autolabel import LabelingAgent, AutolabelDataset, get_data
import json
import os
load_dotenv() # By default, loads from the .env file in the project root directory
# Path to the JSON config shown in section 1; it is passed as a plain string,
# so the library (not this script) is responsible for opening and parsing it.
config='D:\\graphRag\\graph-re\\label\\scene_config.json'
# NOTE(review): the reported traceback is raised from this call, and schema
# validation ran against an empty instance ({}), so the file contents
# apparently never reached the validator. Possibly the file is being read with
# the platform default encoding instead of UTF-8 and the non-ASCII labels fail
# to decode -- TODO confirm against the library's config loader.
agent = LabelingAgent(config=config)
# The dataset is created from the input CSV using the same config path.
ds = AutolabelDataset('D:\\graphRag\\graph-re\\label\\hh.csv', config = config)
# Plan the job for this dataset before running it.
agent.plan(ds)
# Run labeling with max_items=10 (presumably caps the number of rows labeled
# -- verify against the autolabel documentation); the result replaces `ds`.
ds = agent.run(ds, max_items=10)
# Save the result to 'kk.csv' (a relative path).
ds.save('kk.csv')
# Print the head of the dataset's underlying `df` for a quick visual check.
print(str(ds.df.head()))
3.result:
Traceback (most recent call last):
File "D:\graphRag\graph-re\label\label_auto.py", line 16, in <module>
agent = LabelingAgent(config=config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anacondaEnev\envs\graphRag\Lib\site-packages\autolabel\labeler.py", line 93, in __init__
config if isinstance(config, AutolabelConfig) else AutolabelConfig(config)
^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anacondaEnev\envs\graphRag\Lib\site-packages\autolabel\configs\config.py", line 61, in __init__
super().__init__(config, validate=validate)
File "D:\anacondaEnev\envs\graphRag\Lib\site-packages\autolabel\configs\base.py", line 18, in __init__
self._validate()
File "D:\anacondaEnev\envs\graphRag\Lib\site-packages\autolabel\configs\config.py", line 67, in _validate
validate(
File "D:\anacondaEnev\envs\graphRag\Lib\site-packages\jsonschema\validators.py", line 1332, in validate
raise error
jsonschema.exceptions.ValidationError: 'task_name' is a required property
Failed validating 'required' in schema:
{'$schema': 'http://json-schema.org/draft-04/schema#',
'title': 'Label Config',
'description': 'The query configuration to generate autolabels',
'type': 'object',
'properties': {'task_name': {'type': 'string',
'description': 'The task name of the '
'labeling job'},
'task_type': {'enum': ['classification',
'named_entity_recognition',
'question_answering',
'entity_matching',
'multilabel_classification',
'attribute_extraction'],
'description': 'The type of auto '
'labeling task'},
'dataset': {'type': 'object',
'properties': {'label_column': {'type': ['string',
'null']},
'label_separator': {'type': ['string',
'null']},
'text_column': {'type': ['string',
'null']},
'delimiter': {'type': ['string',
'null']},
'explanation_column': {'type': ['string',
'null']},
'disable_quoting': {'type': ['boolean',
'null']}},
'additionalProperties': True},
'transforms': {'type': 'array',
'items': {'type': 'object'},
'additionalProperties': True},
'model': {'type': 'object',
'properties': {'provider': {'enum': ['openai',
'anthropic',
'huggingface_pipeline',
'refuel',
'google',
'cohere',
'custom']},
'name': {'type': 'string'},
'compute_confidence': {'type': ['boolean',
'null']},
'logit_bias': {'type': ['number',
'null']},
'params': {'type': ['object',
'null']}},
'required': ['provider', 'name'],
'additionalProperties': True},
'embedding': {'type': 'object',
'properties': {'provider': {'enum': ['openai',
'anthropic',
'huggingface_pipeline',
'refuel',
'google',
'cohere',
'custom']},
'model': {'type': 'string'}},
'additionalProperties': True},
'prompt': {'type': 'object',
'properties': {'task_guidelines': {'type': 'string'},
'output_guidelines': {'type': 'string'},
'labels': {'anyOf': [{'type': 'array',
'items': {'type': 'string'}},
{'type': 'object'}]},
'example_template': {'type': 'string'},
'few_shot_examples': {'anyOf': [{'type': 'array',
'items': {'type': 'object'}},
{'type': 'string'},
{'type': 'null'}]},
'few_shot_selection': {'enum': ['fixed',
'semantic_similarity',
'max_marginal_relevance',
'label_diversity_random',
'label_diversity_similarity'],
'type': ['string',
'null']},
'few_shot_num': {'type': ['number',
'null']},
'chain_of_thought': {'type': ['boolean',
'null']},
'label_selection': {'type': ['boolean',
'null']},
'label_selection_count': {'type': ['number',
'null']},
'attributes': {'anyOf': [{'type': 'array',
'items': {'type': 'object'}},
{'type': 'null'}]}},
'required': ['task_guidelines'],
'additionalProperties': True},
'dataset_generation': {'type': 'object',
'properties': {'num_rows': {'type': ['number',
'null']},
'guidelines': {'type': ['string',
'null']}}}},
'required': ['task_name', 'task_type', 'model', 'prompt'],
'additionalProperties': True}
On instance:
{}