crewAI-tools
crewAI-tools copied to clipboard
PDFSearchTool does not limit search to provided PDF (when a PDF is provided)
Currently, if a PDFSearchTool is instantiated like so:
pdf_search = PDFSearchTool(pdf='pdf_2.pdf')
And then a query is made to the vector database (typically a chromadb vector database via embedchain), the results returned will not be limited to the provided pdf IF other documents have already been embedded into the vector database.
Something like this ought to work:
PDFEmbedchainAdapter
An updated version of EmbedchainAdapter which saves the pdf "source" when it is added to the vector database, and uses the "source" when querying the vector database (to filter for only that pdf).
class PDFEmbedchainAdapter(Adapter):
    """Embedchain adapter that scopes queries to the most recently added PDF.

    The source handed to ``add`` is remembered in ``src``. When ``src`` is
    set, every query passes it as a ``source`` metadata filter so that other
    documents already stored in the same vector database are excluded.
    """

    embedchain_app: App
    # When True, ``query`` returns the app's answer; otherwise the query is
    # issued with ``dry_run=True`` and the citations are returned instead.
    summarize: bool = False
    # Source of the last PDF passed to ``add``; a falsy value disables filtering.
    src: Optional[str] = None

    def query(self, question: str) -> str:
        """Query the embedchain app, restricted to ``src`` when it is set.

        Args:
            question: The query forwarded to the embedchain app.

        Returns:
            The app's result when ``summarize`` is True; otherwise the first
            element of each returned citation (presumably the chunk text),
            joined by blank lines.
        """
        where = None
        if self.src:
            where = {}
            app_id = self.embedchain_app.config.id
            # A ``None`` app id is not a usable metadata filter value, so the
            # key is only included when the app actually has an id.
            if app_id is not None:
                where["app_id"] = app_id
            where["source"] = self.src

        # TODO: the original author reported that this ``where`` filter did
        # not take effect; verify against the embedchain version in use.
        result, sources = self.embedchain_app.query(
            question, citations=True, dry_run=(not self.summarize), where=where
        )
        if self.summarize:
            return result
        return "\n\n".join([source[0] for source in sources])

    def add(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Add a PDF to the embedchain app and remember its source.

        Args:
            *args: Forwarded to ``embedchain_app.add``; the first positional
                argument, when present, is recorded as the source.
            **kwargs: Forwarded to ``embedchain_app.add``; a ``source``
                keyword is recorded when no positional argument is given.
        """
        # Fall back to the ``source`` keyword so that a keyword-only call does
        # not fail with IndexError before reaching embedchain.
        self.src = args[0] if args else kwargs.get("source")
        self.embedchain_app.add(*args, **kwargs)
Update PDFSearchTool to use the updated PDFEmbedchainAdapter
# Vanilla PDFSearchTool which uses the updated PDFEmbedchainAdapter
class PDFSearchTool(RagTool):
    # RAG tool for semantic search over PDF content. Unless another adapter
    # is already set, it installs a PDFEmbedchainAdapter (see
    # ``_set_default_adapter``).
    name: str = "Search a PDF's content"
    description: str = "A tool that can be used to semantic search a query from a PDF's content."
    args_schema: Type[BaseModel] = PDFSearchToolSchema

    def __init__(self, pdf: Optional[str] = None, **kwargs):
        """Build the tool and, when ``pdf`` is given, bind the tool to it.

        Binding adds the PDF, rewrites the description to mention it, switches
        the argument schema to ``FixedPDFSearchToolSchema`` and regenerates
        the tool description.
        """
        super().__init__(**kwargs)
        if pdf is None:
            # Nothing to bind: keep the generic description and schema.
            return
        self.add(pdf)
        self.description = f"A tool that can be used to semantic search a query the {pdf} PDF's content."
        self.args_schema = FixedPDFSearchToolSchema
        self._generate_description()

    @model_validator(mode="after")
    def _set_default_adapter(self):
        """Swap the RagTool adapter placeholder for a PDFEmbedchainAdapter."""
        if not isinstance(self.adapter, RagTool._AdapterPlaceholder):
            # An adapter other than the placeholder is already set; keep it.
            return self
        if self.config:
            app = App.from_config(config=self.config)
        else:
            app = App()
        self.adapter = PDFEmbedchainAdapter(
            embedchain_app=app,
            summarize=self.summarize,
        )
        return self

    def add(self, *args: Any, **kwargs: Any) -> None:
        """Forward to the parent ``add`` with ``data_type`` forced to PDF_FILE."""
        forwarded = {**kwargs, "data_type": DataType.PDF_FILE}
        super().add(*args, **forwarded)

    def _before_run(self, query: str, **kwargs: Any) -> Any:
        """Add the PDF passed via the ``pdf`` keyword, if one was supplied."""
        if "pdf" not in kwargs:
            return
        self.add(kwargs["pdf"])