ragas
ragas copied to clipboard
unable to apply transformation: 'headlines' property not found in this node
Your Question What should I do when I encountered these errors when generating test data in a non English language?
Code Examples
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from ragas.testset.persona import Persona
from ragas.testset.transforms.extractors.llm_based import NERExtractor
from ragas.testset.transforms.splitters import HeadlineSplitter
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers.single_hop.specific import (
SingleHopSpecificQuerySynthesizer,
)
import os
import asyncio
os.environ["OPENAI_API_KEY"] = "sk-xxxxxxxx"
# 你的其他导入和环境设置保持不变
async def generate_data():
path = "D:/ragas"
loader = DirectoryLoader(path, glob="**/*.md")
docs = loader.load()
print(len(docs))
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="deepseek-chat", base_url="xxxx", api_key="sk-xxxxxxx"))
embedding_model_name = r"embedding\bge-large-zh-v1.5"
embedding_model_kwargs = {'device': 'cpu'}
embedding_encode_kwargs = {'batch_size': 32, 'normalize_embeddings': True}
embed_model = HuggingFaceEmbeddings(
model_name=embedding_model_name,
model_kwargs=embedding_model_kwargs,
encode_kwargs=embedding_encode_kwargs
)
generator_embeddings = LangchainEmbeddingsWrapper(embed_model)
personas = [
Persona(
name="好奇的学生",
role_description="对世界充满好奇并希望更多地了解不同文化和语言的学生",
),
]
transforms = [HeadlineSplitter(), NERExtractor()]
generator = TestsetGenerator(
llm=generator_llm, embedding_model=generator_embeddings, persona_list=personas
)
distribution = [
(SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]
for query, _ in distribution:
prompts = await query.adapt_prompts("chinese", llm=generator_llm)
query.set_prompts(**prompts)
dataset = generator.generate_with_langchain_docs(
docs[:],
testset_size=5,
transforms=transforms,
query_distribution=distribution,
)
print(dataset)
eval_dataset = dataset.to_evaluation_dataset()
print(eval_dataset)
df = eval_dataset.to_pandas()
df.to_csv("ragas_generate_data.csv", encoding='utf-8-sig')
# 使用 asyncio.run 来运行异步函数
if __name__ == "__main__":
asyncio.run(generate_data())
error
D:\CRAG\ragas_generate_data.py:136: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.
embed_model = HuggingFaceEmbeddings(
Applying HeadlineSplitter: 0%| | 0/1 [00:00<?, ?it/s]unable to apply transformation: 'headlines' property not found in this node
Applying NERExtractor: 0%| | 0/1 [00:00<?, ?it/s]unable to apply transformation: Connection error.
Generating Scenarios: 100%|██████████| 1/1 [00:04<00:00, 4.83s/it]
Generating Samples: 0it [00:00, ?it/s]
Traceback (most recent call last):
File "D:\CRAG\ragas_generate_data.py", line 180, in <module>
asyncio.run(generate_data())
File "D:\anaconda\envs\crag\Lib\site-packages\nest_asyncio.py", line 30, in run
return loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda\envs\crag\Lib\site-packages\nest_asyncio.py", line 98, in run_until_complete
return f.result()
^^^^^^^^^^
File "D:\anaconda\envs\crag\Lib\asyncio\futures.py", line 203, in result
raise self._exception.with_traceback(self._exception_tb)
File "D:\anaconda\envs\crag\Lib\asyncio\tasks.py", line 267, in __step
result = coro.send(None)
^^^^^^^^^^^^^^^
File "D:\CRAG\ragas_generate_data.py", line 171, in generate_data
print(dataset)
File "D:\anaconda\envs\crag\Lib\site-packages\ragas\dataset_schema.py", line 277, in __str__
return f"EvaluationDataset(features={self.features()}, len={len(self.samples)})"
^^^^^^^^^^^^^^^
File "D:\anaconda\envs\crag\Lib\site-packages\ragas\dataset_schema.py", line 226, in features
return self.samples[0].get_features()
~~~~~~~~~~~~^^^
IndexError: list index out of range
Process finished with exit code -1073741819 (0xC0000005)
Hello! @Z-oo883
I had the same issue
I solved it by adding HeadlinesExtrcator in transforms
Before: transforms = [HeadlineSplitter(), NERExtractor()]
After: transforms = [HeadlinesExtractor(), HeadlineSplitter(), NERExtractor()]
In order to use HeadlineSplitter, you should add HeadlinesExtrcator in advance.
HeadlineSplitter splits each chunk with headline, but to do that, it needs headlines which is made by HeadlinesExtrcator