This is the code I wrote, using ChatOpenAI as the interface to Tongyi (Qwen), but I ran into the error shown below. Why does it happen?

import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import context_precision, context_recall, faithfulness, answer_relevancy

# 1. Configure the Alibaba Cloud Tongyi Qianwen (Qwen) model.

# API key and base URL provided by Alibaba Cloud.
api = "my_apikey"  # replace with your Alibaba Cloud API key
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"  # Alibaba Cloud OpenAI-compatible endpoint

# Initialise ChatOpenAI against the Tongyi Qianwen model.
llm = ChatOpenAI(
    api_key=api,
    base_url=BASE_URL,
    model="qwen-plus",  # pick the model name you need
)

# Use the embedding model served by Alibaba Cloud.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-v3",  # pick an embedding model offered by Alibaba Cloud
    openai_api_key=api,
    openai_api_base=BASE_URL,
)

# 2. Build the RAGAS evaluation dataset.
# ---------------------------
# Columns used here: question, answer, contexts, ground_truths
# (plus an extra `reference` column).
data = {
    "question": [
        "恐龙是如何被命名的？",
        "恐龙在哪个时代灭绝的？",
    ],
    "answer": [
        "恐龙这个名字来源于希腊语，意为‘恐怖的蜥蜴’。",
        "恐龙大约在6500万年前的白垩纪末期灭绝。",
    ],
    "contexts": [
        ["在19世纪初期，科学家发现了一些不同于现存动物的化石，并认为这些生物非常巨大且凶猛，因此命名为恐龙。"],
        ["化石记录显示，恐龙在白垩纪末期遭遇了剧烈的环境变化，导致大规模灭绝。"],
    ],
    "ground_truths": [
        ["恐龙名字来源于希腊语，意为恐怖的蜥蜴"],
        ["恐龙在大约6500万年前的白垩纪末期灭绝"],
    ],
    # NOTE: every `reference` entry is a list here. This is the input the
    # ValidationError quoted after this script complains about
    # (field `reference`, input_type=list); it is kept as originally posted.
    "reference": [
        ["恐龙名字来源于希腊语，意为恐怖的蜥蜴"],
        ["恐龙在大约6500万年前的白垩纪末期灭绝"],
    ],
}

dataset = Dataset.from_dict(data)
metrics = [context_precision, context_recall, faithfulness, answer_relevancy]
result = evaluate(dataset=dataset, metrics=metrics, llm=llm, embeddings=embedding_model)

# Convert the evaluation result to a pandas DataFrame and print it.
df = result.to_pandas()
print("评估结果：")
print(df)

The error is as follows:

ValidationError: 1 validation error for SingleTurnSample
reference
  Input should be a valid string [type=string_type, input_value=['恐龙名字来源于...意为恐怖的蜥蜴'], input_type=list]
    For further information visit https://errors.pydantic.dev/2.9/v/string_type

Mar 11 '25 15:03 729973389

Hi @729973389, could you please let me know which version of ragas you’re using?

Mar 12 '25 02:03 sahusiddharth

ragas==0.2.14

Mar 12 '25 14:03 729973389

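Hi @729973389, the error is raised because each `reference` entry must be a plain string, not a list of strings. Here is a version with that change applied: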

from datasets import Dataset
from ragas.metrics import context_precision, context_recall, faithfulness, answer_relevancy
from ragas import evaluate

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Wrap the LangChain chat and embedding models in the ragas adapter classes.
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embedding_model = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))

# Sample data: each column holds one entry per evaluated question.
data = {
    "question": [
        "恐龙是如何被命名的？",
        "恐龙在哪个时代灭绝的？"
    ],
    "answer": [
        "恐龙这个名字来源于希腊语，意为‘恐怖的蜥蜴’。",
        "恐龙大约在6500万年前的白垩纪末期灭绝。"
    ],
    "contexts": [
        ["在19世纪初期，科学家发现了一些不同于现存动物的化石，并认为这些生物非常巨大且凶猛，因此命名为恐龙。"],
        ["化石记录显示，恐龙在白垩纪末期遭遇了剧烈的环境变化，导致大规模灭绝。"]
    ],
    # NOTE(review): the reported validation error names only `reference`;
    # `ground_truths` looks unused by ragas 0.2.x - confirm before dropping it.
    "ground_truths": [
        "恐龙名字来源于希腊语，意为恐怖的蜥蜴",  # changed: list replaced by a plain string
        "恐龙在大约6500万年前的白垩纪末期灭绝"  # changed: list replaced by a plain string
    ],
    # `reference` must hold one plain string per sample; a list of strings
    # here is what triggered "Input should be a valid string".
    "reference": [
        "恐龙名字来源于希腊语，意为恐怖的蜥蜴",  # changed: list replaced by a plain string
        "恐龙在大约6500万年前的白垩纪末期灭绝"  # changed: list replaced by a plain string
    ]
}

# Create a Hugging Face dataset from the dictionary
dataset = Dataset.from_dict(data)

metrics = [context_precision, context_recall, faithfulness, answer_relevancy]

# Evaluate the dataset with the wrapped LLM and embedding model
result = evaluate(dataset=dataset, metrics=metrics, llm=llm, embeddings=embedding_model)

# Convert results to a pandas DataFrame and print.
# print(df) rather than a bare `df`, so the frame also appears under the
# header when this runs as a plain script instead of in a notebook cell.
df = result.to_pandas()
print("评估结果：")
print(df)

我建议使用 ragas 的 EvaluationDataset，而不是 Huggingface 的 Dataset。

from ragas.dataset_schema import EvaluationDataset
from ragas.metrics import ContextPrecision, ContextRecall, Faithfulness, AnswerRelevancy
from ragas import evaluate

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Wrap the LangChain chat and embedding models in the ragas adapter classes.
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embedding_model = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(model="text-embedding-3-small")
)

# Sample data: one dict per sample. `reference` is a plain string and
# `retrieved_contexts` is a list of strings.
data = [
    {
        "user_input": "恐龙是如何被命名的？",
        "response": "恐龙这个名字来源于希腊语，意为‘恐怖的蜥蜴’。",
        "reference": "恐龙名字来源于希腊语，意为恐怖的蜥蜴",
        "retrieved_contexts": [
            "在19世纪初期，科学家发现了一些不同于现存动物的化石，并认为这些生物非常巨大且凶猛，因此命名为恐龙。"
        ],
    },
    {
        "user_input": "恐龙在哪个时代灭绝的？",
        "response": "恐龙大约在6500万年前的白垩纪末期灭绝。",
        "reference": "恐龙在大约6500万年前的白垩纪末期灭绝",
        "retrieved_contexts": [
            "化石记录显示，恐龙在白垩纪末期遭遇了剧烈的环境变化，导致大规模灭绝。"
        ],
    },
]

# Build the ragas evaluation dataset from the list of samples
dataset = EvaluationDataset.from_list(data=data)

metrics = [
    ContextPrecision(llm=llm),
    ContextRecall(llm=llm),
    Faithfulness(llm=llm),
    # Pass the embeddings to the metric explicitly: evaluate() below receives
    # no `embeddings` argument, so `embedding_model` would otherwise be
    # created but never used.
    AnswerRelevancy(llm=llm, embeddings=embedding_model),
]

# Evaluate the dataset; every metric already carries its own models
result = evaluate(dataset=dataset, metrics=metrics)

# Convert results to a pandas DataFrame. The bare `df` is displayed by a
# notebook cell; use print(df) when running this as a script.
df = result.to_pandas()
df

你可以在文档中了解更多关于 ragas 的 EvaluationDataset，文档链接：https://docs.ragas.io/en/stable/concepts/components/eval_dataset/

Mar 14 '25 04:03 sahusiddharth