haystack
haystack copied to clipboard
Building a Vector Database from Specific Text
I'm using Haystack 2.0 and Qdrant to create a vector database for PDF files. I wrote the following code to achieve this:
from pathlib import Path
from typing import Dict, Any
from datetime import datetime
from haystack import Pipeline
from haystack.components.others import Multiplexer
from haystack.components.routers import FileTypeRouter
from haystack.components.writers import DocumentWriter
from haystack.components.joiners import DocumentJoiner
from haystack.components.builders import PromptBuilder
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
# Qdrant-backed store that receives the embedded document chunks.
document_store = QdrantDocumentStore(
    host="localhost",
    index="Test",
    embedding_dim=768,
    recreate_index=False,  # keep False so data can be added to the same index later
    timeout=120,
    hnsw_config={"m": 16, "ef_construct": 64},
)

# Router and converters. Only the PDF converter is wired into the pipeline
# below; the text and markdown converters are created but not used here.
file_type_router = FileTypeRouter(
    mime_types=["text/plain", "application/pdf", "text/markdown"]
)
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()

# Cleaning options, collected in one place before building the cleaner.
cleaner_options = {
    "remove_empty_lines": True,
    "remove_extra_whitespaces": True,
    "remove_repeated_substrings": True,
    "remove_substrings": ["References"],
}
cleaner = DocumentCleaner(**cleaner_options)

document_joiner = DocumentJoiner()
document_embedder = SentenceTransformersDocumentEmbedder()
splitter = DocumentSplitter(split_by="passage", split_length=200, split_overlap=1)
writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)

# Register the components with the pipeline, in this order.
indexing_pipeline = Pipeline()
pipeline_components = (
    ("file_type_router", file_type_router),
    ("pdf_file_converter", pdf_converter),
    ("document_joiner", document_joiner),
    ("document_cleaner", cleaner),
    ("document_splitter", splitter),
    ("document_embedder", document_embedder),
    ("document_writer", writer),
)
for component_name, component in pipeline_components:
    indexing_pipeline.add_component(instance=component, name=component_name)

# Wire the stages: router -> PDF converter -> joiner -> cleaner -> splitter
# -> embedder -> writer.
pipeline_connections = (
    ("file_type_router.application/pdf", "pdf_file_converter.sources"),
    ("pdf_file_converter", "document_joiner"),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
)
for sender, receiver in pipeline_connections:
    indexing_pipeline.connect(sender, receiver)

# Index a single PDF by handing its path to the router.
indexing_pipeline.run({"file_type_router": {"sources": ["./data/3.pdf"]}})
However, I'm having trouble removing the "References" section before the data is added to the database. Is there a way to include only specific sections from PDFs when populating the vector database?
Hi @NILICK, have you tried a layout-aware library that could help with this, such as unstructured.io?
Hi @mrm1001, thanks for your suggestion. I'll try it.