ragas ValueError: a cannot be empty unless no samples are taken

[ ] I have checked the documentation and related resources and couldn't resolve my bug.

Describe the bug ValueError: a cannot be empty unless no samples are taken

Ragas version:0.1.10 Python version:

Code to Reproduce

from ragas.testset.generator import TestsetGenerator from ragas.testset.evolutions import simple, reasoning, multi_context import nest_asyncio from langchain_community.document_loaders import PubMedLoader from langchain.text_splitter import CharacterTextSplitter from ragas.testset.docstore import InMemoryDocumentStore from ragas.testset.extractor import KeyphraseExtractor from langchain.embeddings import HuggingFaceEmbeddings from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline ) from langchain.llms import HuggingFacePipeline import torch

model_name='mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "right"

use_4bit = True

bnb_4bit_compute_dtype = "float16"

bnb_4bit_quant_type = "nf4"

use_nested_quant = False

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig( load_in_4bit=use_4bit, bnb_4bit_quant_type=bnb_4bit_quant_type, bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=use_nested_quant, )

mistral_model = AutoModelForCausalLM.from_pretrained( model_name, quantization_config=bnb_config, )

embedding_model_name = 'BAAI/bge-small-en-v1.5' embeddings = HuggingFaceEmbeddings( model_name=embedding_model_name, model_kwargs={'device': 'cuda:0'} )

response_generation_pipeline = pipeline( model=mistral_model, tokenizer=tokenizer, task="text-generation", temperature=0.1, repetition_penalty=1.1, return_full_text=True, max_new_tokens=500, do_sample=True,

) response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

nest_asyncio.apply() loader = PubMedLoader("liver", load_max_docs=10) documents = loader.load() print(len(documents))

generator = TestsetGenerator.from_langchain( generator_llm=response_generation_llm, critic_llm=response_generation_llm, embeddings=embeddings, )

testset = generator.generate_with_langchain_docs(documents, test_size=2, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},is_async = False,raise_exceptions=False)

Error trace File "/home/ubuntu/scp-analyzer/rag-old/test_set.py", line 100, in testset = generator.generate_with_langchain_docs(documents, test_size=2, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},is_async = False,raise_exceptions=False) File "/home/ubuntu/.local/lib/python3.10/site-packages/ragas/testset/generator.py", line 210, in generate_with_langchain_docs return self.generate( File "/home/ubuntu/.local/lib/python3.10/site-packages/ragas/testset/generator.py", line 279, in generate for n in self.docstore.get_random_nodes(k=test_size) File "/home/ubuntu/.local/lib/python3.10/site-packages/ragas/testset/docstore.py", line 329, in get_random_nodes nodes = rng.choice(np.array(self.nodes), size=k, p=prob).tolist() File "numpy/random/_generator.pyx", line 803, in numpy.random._generator.Generator.choice ValueError: a cannot be empty unless no samples are taken

Expected behavior A clear and concise description of what you expected to happen.

Additional context Add any other context about the problem here. @jjmachan @shahules786

Jul 18 '24 07:07 Rugved2204

The error ValueError: a cannot be empty unless no samples are taken occurs because the list of documents loaded from PubMedLoader is empty. Ensure that the documents list is not empty before passing it to the TestsetGenerator. You can add a check to verify that documents are loaded successfully:

from langchain_community.document_loaders import PubMedLoader
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
import torch
import nest_asyncio

model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

embedding_model_name = 'BAAI/bge-small-en-v1.5'
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': 'cuda:0'}
)

response_generation_pipeline = pipeline(
    model=mistral_model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=500,
    do_sample=True,
)

response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

nest_asyncio.apply()
loader = PubMedLoader("liver", load_max_docs=10)
documents = loader.load()

# Check if documents are loaded
if not documents:
    raise ValueError("No documents were loaded. Please check your query or parameters.")

print(len(documents))

generator = TestsetGenerator.from_langchain(
    generator_llm=response_generation_llm,
    critic_llm=response_generation_llm,
    embeddings=embeddings,
)

testset = generator.generate_with_langchain_docs(
    documents, 
    test_size=2, 
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    is_async=False,
    raise_exceptions=False
)

This will ensure that you do not attempt to generate a test set with an empty list of documents ^[1]^[2].

To continue talking to Dosu, mention @dosu.

Jul 18 '24 09:07 dosubot[bot]

the list is not empty @dosu

Jul 18 '24 11:07 Rugved2204

If the list of documents is not empty and you are still encountering the ValueError: a cannot be empty unless no samples are taken error, it might be due to the docstore not being populated correctly. Ensure that the documents are added to the docstore before generating the test set.

Here's an updated version of your code to ensure the docstore is populated:

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
import nest_asyncio
from langchain_community.document_loaders import PubMedLoader
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
import torch

model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

embedding_model_name = 'BAAI/bge-small-en-v1.5'
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': 'cuda:0'}
)

response_generation_pipeline = pipeline(
    model=mistral_model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=500,
    do_sample=True,
)

response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

nest_asyncio.apply()
loader = PubMedLoader("liver", load_max_docs=10)
documents = loader.load()

# Check if documents are loaded
if not documents:
    raise ValueError("No documents were loaded. Please check your query or parameters.")

print(len(documents))

generator = TestsetGenerator.from_langchain(
    generator_llm=response_generation_llm,
    critic_llm=response_generation_llm,
    embeddings=embeddings,
)

# Add documents to the docstore
generator.docstore.add_documents(documents)
print(f"Docstore contains {len(generator.docstore.nodes)} nodes")

# Generate the test set
testset = generator.generate_with_langchain_docs(
    documents, 
    test_size=2, 
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    is_async=False,
    raise_exceptions=False
)

This ensures that the documents are added to the docstore before attempting to generate the test set ^[1].

To continue talking to Dosu, mention @dosu.

Jul 18 '24 11:07 dosubot[bot]

Hi @dosu,

I have the exact same problem and noticed that in KeyphraseExtractor->extract method the extracted keyphrases is a list of keyphrases dicts:

[{'keyphrases': ['Black hole', 'Region of spacetime', 'Strong gravity', 'Light and electromagnetic waves', 'Theory of general relativity']}, {'keyphrases': ['Great Wall of China', 'Ancient fortifications', 'Northern China']}, {'keyphrases': ['New Samsung Washer', 'Installation', 'Use and Care', 'Important Information', 'Manual', 'Operating Instructions', 'Warnings and Important Safety Instructions', 'Common Sense', 'Caution and Care', 'Electricity and Moving Parts', 'Personal Injury', 'Death', 'Property Damage']}]

The first two seems to be based on the examples in the prompt. The last one is from the document that I am feeding it.

On line 53 in ragas.testset.extractor.py:

async def extract(self, node: Node, is_async: bool = True) -> t.List[str]:
    prompt = self.extractor_prompt.format(text=node.page_content)
    results = await self.llm.generate(prompt=prompt, is_async=is_async)
    keyphrases = await json_loader.safe_load(
        results.generations[0][0].text.strip(), llm=self.llm, is_async=is_async
    )
    keyphrases = keyphrases if isinstance(keyphrases, dict) else {}
    logger.debug("topics: %s", keyphrases)
    return keyphrases.get("keyphrases", [])

the keyphrases is set to an empty dict if the json_loader returns a list, as in my case. I am not sure what to do if this is the correct way to handle the valid keyphrases being generated.

This means that the nodes are not being appended on line 264 (which is not being executed) in ragas.testset.docstore.py:

def add_nodes(self, nodes: t.Sequence[Node], show_progress=True):
    assert self.embeddings is not None, "Embeddings must be set"
    assert self.extractor is not None, "Extractor must be set"

    # NOTE: Adds everything in async mode for now.
    nodes_to_embed = {}
    nodes_to_extract = {}

    # get embeddings for the docs
    executor = Executor(
        desc="embedding nodes",
        keep_progress_bar=False,
        raise_exceptions=True,
        run_config=self.run_config,
    )
    result_idx = 0
    for i, n in enumerate(nodes):
        if n.embedding is None:
            nodes_to_embed.update({i: result_idx})
            executor.submit(
                self.embeddings.embed_text,
                n.page_content,
                name=f"embed_node_task[{i}]",
            )
            result_idx += 1

        if not n.keyphrases:
            nodes_to_extract.update({i: result_idx})
            executor.submit(
                self.extractor.extract,
                n,
                name=f"keyphrase-extraction[{i}]",
            )
            result_idx += 1
    results = executor.results()
    if not results:
        raise ExceptionInRunner()

    for i, n in enumerate(nodes):
        if i in nodes_to_embed.keys():
            n.embedding = results[nodes_to_embed[i]]
        if i in nodes_to_extract.keys():
            keyphrases = results[nodes_to_extract[i]]
            n.keyphrases = keyphrases

        if n.embedding is not None and n.keyphrases != []:
            self.nodes.append(n)
            self.node_map[n.doc_id] = n
            assert isinstance(
                n.embedding, (list, np.ndarray)
            ), "Embedding must be list or np.ndarray"
            self.node_embeddings_list.append(n.embedding)

    self.calculate_nodes_docs_similarity()
    self.set_node_relataionships()

Jul 20 '24 14:07 samiislam

@shahules786 can you take a look at this?

Aug 02 '24 06:08 jjmachan

您好，

我遇到了完全相同的问题，并注意到在 KeyphraseExtractor->extract 方法中，提取的 keyphrases 是一个 keyphrases dicts 列表：

[{'keyphrases': ['Black hole', 'Region of spacetime', 'Strong gravity', 'Light and electromagnetic waves', 'Theory of general relativity']}, {'keyphrases': ['Great Wall of China', 'Ancient fortifications', 'Northern China']}, {'keyphrases': ['New Samsung Washer', 'Installation', 'Use and Care', 'Important Information', 'Manual', 'Operating Instructions', 'Warnings and Important Safety Instructions', 'Common Sense', 'Caution and Care', 'Electricity and Moving Parts', 'Personal Injury', 'Death', 'Property Damage']}]

前两个似乎是基于提示中的示例。最后一个来自我正在提供给它的文档。

在 ragas.testset.extractor.py 的 53 号线路上：

async def extract(self, node: Node, is_async: bool = True) -> t.List[str]:
    prompt = self.extractor_prompt.format(text=node.page_content)
    results = await self.llm.generate(prompt=prompt, is_async=is_async)
    keyphrases = await json_loader.safe_load(
        results.generations[0][0].text.strip(), llm=self.llm, is_async=is_async
    )
    keyphrases = keyphrases if isinstance(keyphrases, dict) else {}
    logger.debug("topics: %s", keyphrases)
    return keyphrases.get("keyphrases", [])

如果 keyPhrases 返回一个列表，则 json_loader 将设置为空 dict，就像我的例子一样。如果这是处理生成的有效关键词的正确方法，我不确定该怎么办。

这意味着节点不会附加到 ragas.testset.docstore.py 中的第 264 行（未执行）：

def add_nodes(self, nodes: t.Sequence[Node], show_progress=True):
    assert self.embeddings is not None, "Embeddings must be set"
    assert self.extractor is not None, "Extractor must be set"

    # NOTE: Adds everything in async mode for now.
    nodes_to_embed = {}
    nodes_to_extract = {}

    # get embeddings for the docs
    executor = Executor(
        desc="embedding nodes",
        keep_progress_bar=False,
        raise_exceptions=True,
        run_config=self.run_config,
    )
    result_idx = 0
    for i, n in enumerate(nodes):
        if n.embedding is None:
            nodes_to_embed.update({i: result_idx})
            executor.submit(
                self.embeddings.embed_text,
                n.page_content,
                name=f"embed_node_task[{i}]",
            )
            result_idx += 1

        if not n.keyphrases:
            nodes_to_extract.update({i: result_idx})
            executor.submit(
                self.extractor.extract,
                n,
                name=f"keyphrase-extraction[{i}]",
            )
            result_idx += 1
    results = executor.results()
    if not results:
        raise ExceptionInRunner()

    for i, n in enumerate(nodes):
        if i in nodes_to_embed.keys():
            n.embedding = results[nodes_to_embed[i]]
        if i in nodes_to_extract.keys():
            keyphrases = results[nodes_to_extract[i]]
            n.keyphrases = keyphrases

        if n.embedding is not None and n.keyphrases != []:
            self.nodes.append(n)
            self.node_map[n.doc_id] = n
            assert isinstance(
                n.embedding, (list, np.ndarray)
            ), "Embedding must be list or np.ndarray"
            self.node_embeddings_list.append(n.embedding)

    self.calculate_nodes_docs_similarity()
    self.set_node_relataionships()

I have the same bugs as you, did you sovel it now ?

Sep 10 '24 09:09 zhi-jiao

I am not using RAGAS at the moment since the PoC I am working on just needed some evaluation. I switched to TruLens which was easier for me to get up and running.

Sep 10 '24 16:09 samiislam