haystack
haystack copied to clipboard
QA problem in using QdrantDocumentStore
I used `QdrantDocumentStore` to create a vector database with this code:
def document_store(index: str, uploaded_file, citation: str, progress=gr.Progress()) -> str:
    """Convert, clean, split, embed and write one uploaded file into Qdrant.

    Args:
        index: Name of the Qdrant collection to write to. Whatever code later
            queries these documents has to open the same collection name.
        uploaded_file: File to index; handed to the pipeline as
            ``str(uploaded_file)``.
        citation: BibTeX entry describing the file. Every ``key={value}``
            field found in it is attached to the documents as metadata.
        progress: Not used inside this function (presumably a Gradio
            progress tracker injected by the UI -- confirm with the caller).

    Returns:
        A confirmation message that names the target collection.
    """
    store = QdrantDocumentStore(
        host="localhost",
        index=index,
        embedding_dim=768,
        recreate_index=False,  # keep existing data so more files can be added later
        timeout=120,
        hnsw_config={"m": 16, "ef_construct": 64},
    )

    # One converter per MIME type the router emits. Every route must end in a
    # converter, otherwise files of that type are routed nowhere and never
    # reach the writer.
    converters = {
        "text/plain": ("text_file_converter", TextFileToDocument()),
        "application/pdf": ("pypdf_converter", PyPDFToDocument()),
        "text/markdown": ("markdown_converter", MarkdownToDocument()),
    }
    file_type_router = FileTypeRouter(mime_types=list(converters))
    cleaner = DocumentCleaner(
        remove_empty_lines=True,
        remove_extra_whitespaces=False,
        remove_repeated_substrings=True,
    )
    # NOTE(review): 200 sentences per chunk is presumably much longer than the
    # embedding model's input limit, so the tail of each chunk may not
    # influence its vector -- verify and consider a smaller split_length.
    splitter = DocumentSplitter(split_by="sentence", split_length=200, split_overlap=50)
    # The model is spelled out (it is the library default) because the
    # query-side text embedder has to use exactly the same model.
    document_embedder = SentenceTransformersDocumentEmbedder(
        model="sentence-transformers/all-mpnet-base-v2"
    )
    writer = DocumentWriter(document_store=store, policy=DuplicatePolicy.SKIP)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component(instance=file_type_router, name="file_type_router")
    indexing_pipeline.add_component(instance=Multiplexer(Dict[str, Any]), name="metadata_multiplexer")
    indexing_pipeline.add_component(instance=DocumentJoiner(), name="document_joiner")
    indexing_pipeline.add_component(instance=cleaner, name="document_cleaner")
    indexing_pipeline.add_component(instance=splitter, name="document_splitter")
    indexing_pipeline.add_component(instance=document_embedder, name="document_embedder")
    indexing_pipeline.add_component(instance=writer, name="document_writer")

    # Router -> converter -> joiner for each file type; the multiplexer fans
    # the same metadata dict out to whichever converter ends up running.
    for mime_type, (name, converter) in converters.items():
        indexing_pipeline.add_component(instance=converter, name=name)
        indexing_pipeline.connect(f"file_type_router.{mime_type}", f"{name}.sources")
        indexing_pipeline.connect("metadata_multiplexer", f"{name}.meta")
        indexing_pipeline.connect(name, "document_joiner")

    indexing_pipeline.connect("document_joiner", "document_cleaner")
    indexing_pipeline.connect("document_cleaner", "document_splitter")
    indexing_pipeline.connect("document_splitter", "document_embedder")
    indexing_pipeline.connect("document_embedder", "document_writer")

    ## Adding Citation as Metadata for Papers
    # Convert BibTeX to a dictionary. Whitespace around "=" is tolerated so
    # both `title={...}` and `title = {...}` are picked up. Values that
    # contain nested braces are still cut at the first "}".
    kv = re.compile(r'\b(?P<key>\w+)\s*=\s*{(?P<value>[^}]+)}')
    meta_bibtex = dict(kv.findall(citation))
    # Adding `date_added` key-value to metadata
    meta_bibtex["date_added"] = datetime.now().isoformat()

    ## Run the pipeline with the file you want to index
    indexing_pipeline.run(
        {
            "file_type_router": {"sources": [str(uploaded_file)]},
            "metadata_multiplexer": {"value": meta_bibtex},
        }
    )
    return f"Text of your PDF file successfully added to Qdrant [{index}] collection."
Embedding the texts of multiple PDF files into the document store succeeds, but when I query this document store, it only uses the last PDF file uploaded to the Qdrant document store; if I ask about the other PDF files, it returns nothing.
I use this code for querying:
# Collection to search. It has to be the same name that was passed as `index`
# when the documents were written; a different name opens a different
# collection and the retriever will not see those documents.
INDEX_NAME = "aasents"

# Queries must be embedded with the same model that embedded the documents.
# The indexing side builds SentenceTransformersDocumentEmbedder() without a
# model argument, i.e. with the library default named here. To use another
# model, change it on both sides and re-index.
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

document_store = QdrantDocumentStore(
    url="localhost",
    index=INDEX_NAME,
    embedding_dim=768,
    hnsw_config={"m": 16, "ef_construct": 64},
)

generator = OllamaGenerator(model="phi3")
text_embedder = SentenceTransformersTextEmbedder(model=EMBEDDING_MODEL)
text_embedder.warm_up()

# Prompt: the retrieved documents are inlined as context ahead of the question.
template = """
Answer the questions based on the given context.
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{ question }}
Answer:
"""

# embed question -> retrieve similar documents -> build prompt -> generate
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", QdrantEmbeddingRetriever(document_store=document_store))
query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
query_pipeline.add_component("llm", generator)
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever", "prompt_builder.documents")
query_pipeline.connect("prompt_builder", "llm")

question = "what is Spatial Variability and Periodicity of Precipitation in the Middle Reaches of the Yellow River?"
result = query_pipeline.run(
    {
        "text_embedder": {"text": question},
        "retriever": {"top_k": 10},
        "prompt_builder": {"question": question},
    }
)
print(result["llm"]["replies"])
How can I use all the vectors available in the Qdrant document store for QA?
Does the retriever only return 1 document? If you're on the latest version (2.1.0), call your pipeline as `query_pipeline.run(data, include_outputs_from={"retriever"})`, where `data` is the same inputs dictionary you already pass to `run`.
Then you can see what gets returned by the retriever.
Does the retriever only return 1 document? If you're on the latest version (2.1.0) pass this to your pipeline:
query_pipeline.run(data, include_outputs_from={"retriever"})
Then you can see what gets returned by the retriever.
Hi @mrm1001, thanks for your attention. What do you mean by the `data` argument?
I tried to solve this problem by using a metadata filter:
# Only retrieve documents whose `meta.title` equals `value_rf`.
title_filter = {"field": "meta.title", "operator": "==", "value": value_rf}

# The same question feeds both embedders and the prompt builder.
run_inputs = {
    "dense_text_embedder": {"text": question},
    "sparse_text_embedder": {"text": question},
    "retriever": {"top_k": 2, "filters": title_filter},
    "prompt_builder": {"question": question},
}
result = query_pipeline.run(run_inputs)
It returns the correct result, but I don't know if my solution is correct or not!
What I meant is, can you try to see what is returned by the retriever, without adding the filters?
@NILICK in your `document_store` function, you are creating the Qdrant collection with the value of `index`,
while in your retrieval code the collection name is hardcoded.
If you want all of your documents to be in the same collection, keep it a constant, or leave it blank to use `Document`
as the default value for both pipelines. In that case, it should then query all of your documents.