haystack icon indicating copy to clipboard operation
haystack copied to clipboard

Building a Vector Database from Specific Text

Open NILICK opened this issue 10 months ago • 2 comments

I'm using Haystack 2.0 and Qdrant to create a vector database for PDF files. I wrote the following code to achieve this:

from pathlib import Path
from typing import Dict, Any
from datetime import datetime
from haystack import Pipeline
from haystack.components.others import Multiplexer
from haystack.components.routers import FileTypeRouter
from haystack.components.writers import DocumentWriter
from haystack.components.joiners import DocumentJoiner
from haystack.components.builders import PromptBuilder
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder

document_store = QdrantDocumentStore(host="localhost",
                                     index="Test",
                                     embedding_dim=768,
                                     recreate_index=False, # If you want add another data later, set to "False"
                                     timeout=120,
                                     hnsw_config={"m": 16, "ef_construct": 64}
                                    )

file_type_router = FileTypeRouter(mime_types=["text/plain", "application/pdf", "text/markdown"])
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
cleaner = DocumentCleaner(
    remove_empty_lines = True,
    remove_extra_whitespaces = True,
    remove_repeated_substrings = True,
    remove_substrings = ["References"],
)

document_joiner = DocumentJoiner()
document_embedder = SentenceTransformersDocumentEmbedder()

splitter = DocumentSplitter(split_by="passage", split_length=200, split_overlap=1) 
writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)

## Add components to the pipeline
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(instance=file_type_router, name="file_type_router")
indexing_pipeline.add_component(instance = pdf_converter, name="pdf_file_converter")
indexing_pipeline.add_component(instance=document_joiner, name="document_joiner")
indexing_pipeline.add_component(instance = cleaner, name="document_cleaner")
indexing_pipeline.add_component(instance = splitter, name="document_splitter")
indexing_pipeline.add_component(instance=document_embedder, name="document_embedder")
indexing_pipeline.add_component(instance = writer, name="document_writer")

indexing_pipeline.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
indexing_pipeline.connect("pdf_file_converter", "document_joiner")
indexing_pipeline.connect("document_joiner", "document_cleaner")
indexing_pipeline.connect("document_cleaner", "document_splitter")
indexing_pipeline.connect("document_splitter", "document_embedder")
indexing_pipeline.connect("document_embedder", "document_writer")

indexing_pipeline.run({"file_type_router": {"sources": ["./data/3.pdf"]},
                                           })

However, I'm having trouble removing the "References" section before the data is added to the database. Is there a way to include only specific sections from PDFs when populating the vector database?

NILICK avatar Apr 21 '24 07:04 NILICK

Hi @NILICK, have you tried using a layout-aware library that could help you do this, like using unstructured.io?

mrm1001 avatar Apr 26 '24 14:04 mrm1001

Hi @mrm1001, thanks for your suggestion. I'll try it.

NILICK avatar May 03 '24 05:05 NILICK