crewAI-tools icon indicating copy to clipboard operation
crewAI-tools copied to clipboard

PDFSearchTool does not limit search to provided PDF (when a PDF is provided)

Open sethcoast opened this issue 8 months ago • 0 comments

Currently, if a `PDFSearchTool` is instantiated like so:

    pdf_search = PDFSearchTool(pdf='pdf_2.pdf')

and a query is then made to the vector database (typically a ChromaDB vector database accessed via embedchain), the returned results are not limited to the provided PDF if other documents have already been embedded into the vector database.

Something like this ought to work:

PDFEmbedchainAdapter

An updated version of EmbedchainAdapter which saves the pdf "source" when it is added to the vector database, and uses the "source" when querying the vector database (to filter for only that pdf).

class PDFEmbedchainAdapter(Adapter):
    """Adapter that tries to scope embedchain queries to the last added PDF.

    ``add`` remembers the source it was given in ``src``; ``query`` then
    passes that source (together with the app id) as a ``where`` filter to
    the embedchain app.
    """

    embedchain_app: App
    # When True, ``query`` returns the app's answer; when False the query is
    # issued with ``dry_run=True`` and the cited snippets are returned instead.
    summarize: bool = False
    # Source most recently handed to ``add``; None until something is added.
    src: Optional[str] = None

    def query(self, question: str) -> str:
        """Query the embedchain app, filtering on ``src`` when it is set.

        Args:
            question: The text to search for.

        Returns:
            The app's answer when ``summarize`` is True; otherwise the first
            element of every returned citation, joined by blank lines.
        """
        where = None
        if self.src:
            where = {"app_id": self.embedchain_app.config.id, "source": self.src}

        # TODO: this where clause is not working for some reason
        # NOTE(review): confirm which metadata key embedchain stores the
        # document source under before relying on this filter.
        result, sources = self.embedchain_app.query(
            question, citations=True, dry_run=(not self.summarize), where=where
        )
        if self.summarize:
            return result
        return "\n\n".join(source[0] for source in sources)

    def add(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Record the PDF source and forward the call to the embedchain app.

        The source is taken from the first positional argument, falling back
        to a ``source`` keyword argument. If neither is present ``src`` is
        reset to None (so later queries are unfiltered) instead of raising
        ``IndexError`` here; the embedchain app is left to report the missing
        argument itself.

        Args:
            *args: Positional arguments forwarded unchanged to the app's ``add``.
            **kwargs: Keyword arguments forwarded unchanged to the app's ``add``.
        """
        self.src = args[0] if args else kwargs.get("source")
        self.embedchain_app.add(*args, **kwargs)

Update PDFSearchTool to use the updated PDFEmbedchainAdapter

# PDFSearchTool whose default adapter is PDFEmbedchainAdapter.
# When constructed with ``pdf``, that file is added immediately and the tool's
# description and argument schema are switched to the fixed-PDF variants.
class PDFSearchTool(RagTool):
    name: str = "Search a PDF's content"
    description: str = "A tool that can be used to semantic search a query from a PDF's content."
    args_schema: Type[BaseModel] = PDFSearchToolSchema

    def __init__(self, pdf: Optional[str] = None, **kwargs):
        """Build the tool and, if ``pdf`` is given, bind it to that PDF.

        Args:
            pdf: Optional PDF source to add at construction time.
            **kwargs: Forwarded to the parent initializer.
        """
        super().__init__(**kwargs)

        # Nothing more to configure when no PDF was supplied up front.
        if pdf is None:
            return

        self.add(pdf)
        self.description = f"A tool that can be used to semantic search a query the {pdf} PDF's content."
        self.args_schema = FixedPDFSearchToolSchema
        self._generate_description()

    @model_validator(mode="after")
    def _set_default_adapter(self):
        """Swap the placeholder adapter for a PDFEmbedchainAdapter.

        An adapter that is not the RagTool placeholder is left untouched.
        """
        if not isinstance(self.adapter, RagTool._AdapterPlaceholder):
            return self

        # Build the embedchain app from the tool's config when one is set.
        if self.config:
            embedchain_app = App.from_config(config=self.config)
        else:
            embedchain_app = App()

        self.adapter = PDFEmbedchainAdapter(
            embedchain_app=embedchain_app,
            summarize=self.summarize,
        )
        return self

    def add(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Add a source via the parent tool, always as ``DataType.PDF_FILE``."""
        # Any caller-supplied ``data_type`` is overridden.
        pdf_kwargs = {**kwargs, "data_type": DataType.PDF_FILE}
        super().add(*args, **pdf_kwargs)

    def _before_run(
        self,
        query: str,
        **kwargs: Any,
    ) -> Any:
        """Add the PDF passed as the ``pdf`` keyword argument, if present."""
        if "pdf" not in kwargs:
            return
        self.add(kwargs["pdf"])

sethcoast avatar Jun 25 '24 21:06 sethcoast