Two different document loaders for Microsoft Word files
Hello, I've noticed that after the latest commit from @MthwRobinson there are two different modules for loading Word documents. Could they be unified into a single version? There are also two notebooks that do almost the same thing: microsoft_word.ipynb and word_document.ipynb.
Or am I just missing something?
I agree!
There doesn't seem to be much to unify: docx only works with .docx files, while the word version also supports .doc, so docx is obsolete.
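For reference, a minimal sketch of using the surviving loader (the file name here is hypothetical):

from langchain.document_loaders import UnstructuredWordDocumentLoader

# UnstructuredWordDocumentLoader handles both legacy .doc and modern .docx files
loader = UnstructuredWordDocumentLoader("example.doc")
docs = loader.load()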
I'm unable to load all the Word files present in the folder. Below is the code:
txt_loader = DirectoryLoader(folder_path, glob="./*.docx")
documents += txt_loader.load()
but the code below works for only one file:
txt_loader = UnstructuredWordDocumentLoader(r"Q&A with PDF & Txt\Documents\Data Scientist.docx", mode="elements")
documents += txt_loader.load()
Is there any way to overcome this?
@nithinreddyyyyyy - That seems like it should work. Could you post your unstructured version and the error message you're getting here? You can also pass the UnstructuredWordDocumentLoader explicitly to the DirectoryLoader like this:
loader = DirectoryLoader(folder_path, glob="./*.docx", loader_cls=UnstructuredWordDocumentLoader)
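As a side note, and only an assumption about your folder layout: a glob of "./*.docx" matches files directly inside folder_path only. If the documents live in subfolders, a recursive pattern is needed:

# Hypothetical variant that also descends into subdirectories
loader = DirectoryLoader(folder_path, glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader)
documents = loader.load()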
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain import VectorDBQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
import os
import warnings

warnings.filterwarnings("ignore")

# Set up the environment variable for the OpenAI API key
os.environ["OPENAI_API_KEY"] = ""
def get_documents(folder_path, file_extension):
    documents = []
    if file_extension == 'pdf':
        pdf_loader = DirectoryLoader(folder_path, glob="./*.pdf", loader_cls=PyPDFLoader)  # Select PDF files
        documents += pdf_loader.load()
    elif file_extension == 'txt':
        txt_loader = DirectoryLoader(folder_path, glob="./*.txt")  # Select TXT files
        documents += txt_loader.load()
    elif file_extension == 'docx':
        docx_loader = DirectoryLoader(folder_path, glob="./*.docx", loader_cls=UnstructuredWordDocumentLoader)
        documents += docx_loader.load()
    elif file_extension == 'combined':
        pdf_loader = DirectoryLoader(folder_path, glob="./*.pdf", loader_cls=PyPDFLoader)  # Select PDF files
        documents += pdf_loader.load()
        txt_loader = DirectoryLoader(folder_path, glob="./*.txt")  # Select TXT files
        documents += txt_loader.load()
    else:
        return None
    return documents
def get_query_result(query, documents):
    # Split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    # Query documents
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])
    docsearch = Chroma.from_documents(texts, embeddings)
    qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch, return_source_documents=True)
    result = qa({"query": query})
    result_text = result['result'].strip()
    source = result.get('source_documents', [{}])[0].metadata.get('source', '')
    page = result.get('source_documents', [{}])[0].metadata.get('page', '')
    return result_text, source, page
def chat_loop(file_extension, folder_path):
    documents = get_documents(folder_path, file_extension)
    if documents is None:
        print("Invalid folder path or no supported files found.")
        return
    while True:
        query = input("Enter your query (type 'exit' to end): ")
        if query.lower() == 'exit':
            break
        result = get_query_result(query, documents)
        if result is not None:
            result_text, source, page = result
            print("Result:", result_text)
            if source:
                print("Source:", source)
                print("Page:", page)
        else:
            print("No answer found for the query.")
        print()  # Print an empty line for separation
# Get the selected file extension and folder path from the webpage
selected_file_extension = 'docx' # Replace with the value obtained from the dropdown
folder_path = 'Documents' # Replace with the folder path obtained from the user input on the webpage
# Start the chat loop
chat_loop(selected_file_extension, folder_path)
Above is the code; below is the error:
NotEnoughElementsException Traceback (most recent call last)
Cell In[8], line 85
82 folder_path = 'Documents' # Replace with the folder path obtained from the user input on the webpage
84 # Start the chat loop
---> 85 chat_loop(selected_file_extension, folder_path)
Cell In[8], line 67, in chat_loop(file_extension, folder_path)
64 if query.lower() == 'exit':
65 break
---> 67 result = get_query_result(query, documents)
69 if result is not None:
70 result_text, source, page = result
Cell In[8], line 48, in get_query_result(query, documents)
46 docsearch = Chroma.from_documents(texts, embeddings)
47 qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch, return_source_documents=True)
---> 48 result = qa({"query": query})
50 result_text = result['result'].strip()
51 source = result.get('source_documents', [{}])[0].metadata.get('source', '')
File ~\anaconda3\lib\site-packages\langchain\chains\base.py:116, in Chain.__call__(self, inputs, return_only_outputs)
114 except (KeyboardInterrupt, Exception) as e:
115 self.callback_manager.on_chain_error(e, verbose=self.verbose)
--> 116 raise e
117 self.callback_manager.on_chain_end(outputs, verbose=self.verbose)
118 return self.prep_outputs(inputs, outputs, return_only_outputs)
File ~\anaconda3\lib\site-packages\langchain\chains\base.py:113, in Chain.__call__(self, inputs, return_only_outputs)
107 self.callback_manager.on_chain_start(
108 {"name": self.__class__.__name__},
109 inputs,
110 verbose=self.verbose,
111 )
112 try:
--> 113 outputs = self._call(inputs)
114 except (KeyboardInterrupt, Exception) as e:
115 self.callback_manager.on_chain_error(e, verbose=self.verbose)
File ~\anaconda3\lib\site-packages\langchain\chains\retrieval_qa\base.py:109, in BaseRetrievalQA._call(self, inputs)
96 """Run get_relevant_text and llm on input query.
97
98 If chain has 'return_source_documents' as 'True', returns
(...)
105 answer, docs = res['result'], res['source_documents']
106 """
107 question = inputs[self.input_key]
--> 109 docs = self._get_docs(question)
110 answer = self.combine_documents_chain.run(
111 input_documents=docs, question=question
112 )
114 if self.return_source_documents:
File ~\anaconda3\lib\site-packages\langchain\chains\retrieval_qa\base.py:203, in VectorDBQA._get_docs(self, question)
201 def _get_docs(self, question: str) -> List[Document]:
202 if self.search_type == "similarity":
--> 203 docs = self.vectorstore.similarity_search(
204 question, k=self.k, **self.search_kwargs
205 )
206 elif self.search_type == "mmr":
207 docs = self.vectorstore.max_marginal_relevance_search(
208 question, k=self.k, **self.search_kwargs
209 )
File ~\anaconda3\lib\site-packages\langchain\vectorstores\chroma.py:144, in Chroma.similarity_search(self, query, k, filter, **kwargs)
127 def similarity_search(
128 self,
129 query: str,
(...)
132 **kwargs: Any,
133 ) -> List[Document]:
134 """Run similarity search with Chroma.
135
136 Args:
(...)
142 List[Document]: List of documents most similar to the query text.
143 """
--> 144 docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
145 return [doc for doc, _ in docs_and_scores]
File ~\anaconda3\lib\site-packages\langchain\vectorstores\chroma.py:190, in Chroma.similarity_search_with_score(self, query, k, filter, **kwargs)
188 else:
189 query_embedding = self._embedding_function.embed_query(query)
--> 190 results = self._collection.query(
191 query_embeddings=[query_embedding], n_results=k, where=filter
192 )
194 return _results_to_docs_and_scores(results)
File ~\anaconda3\lib\site-packages\chromadb\api\models\Collection.py:219, in Collection.query(self, query_embeddings, query_texts, n_results, where, where_document, include)
216 if where_document is None:
217 where_document = {}
--> 219 return self._client._query(
220 collection_name=self.name,
221 query_embeddings=query_embeddings,
222 n_results=n_results,
223 where=where,
224 where_document=where_document,
225 include=include,
226 )
File ~\anaconda3\lib\site-packages\chromadb\api\local.py:408, in LocalAPI._query(self, collection_name, query_embeddings, n_results, where, where_document, include)
399 def _query(
400 self,
401 collection_name,
(...)
406 include: Include = ["documents", "metadatas", "distances"],
407 ):
--> 408 uuids, distances = self._db.get_nearest_neighbors(
409 collection_name=collection_name,
410 where=where,
411 where_document=where_document,
412 embeddings=query_embeddings,
413 n_results=n_results,
414 )
416 include_embeddings = "embeddings" in include
417 include_documents = "documents" in include
File ~\anaconda3\lib\site-packages\chromadb\db\clickhouse.py:583, in Clickhouse.get_nearest_neighbors(self, where, where_document, embeddings, n_results, collection_name, collection_uuid)
580 ids = None
582 index = self._index(collection_uuid)
--> 583 uuids, distances = index.get_nearest_neighbors(embeddings, n_results, ids)
585 return uuids, distances
File ~\anaconda3\lib\site-packages\chromadb\db\index\hnswlib.py:238, in Hnswlib.get_nearest_neighbors(self, query, k, ids)
235 self._check_dimensionality(query)
237 if k > self._index_metadata["elements"]:
--> 238 raise NotEnoughElementsException(
239 f"Number of requested results {k} cannot be greater than number of elements in index {self._index_metadata['elements']}"
240 )
242 s2 = time.time()
243 # get ids from uuids as a set, if they are available
NotEnoughElementsException: Number of requested results 4 cannot be greater than number of elements in index 3
I don't know why it is returning this error.
I didn't see unstructured or document_loaders in the traceback (unless I missed it). It seems like the exception is coming from ChromaDB. I'm wondering if the source of this error might be in the Chroma vectorstore or somewhere else. It may be worthwhile to spin off a separate issue for this and flag it for someone more familiar with the vectorstores.
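For what it's worth, the exception message says the retriever requested 4 results while the Chroma index only holds 3 chunks. One possible workaround (an untested sketch; VectorDBQA accepted a k parameter in this version of LangChain) is to cap the number of requested results inside get_query_result:

# Hypothetical tweak: never request more results than the index contains
qa = VectorDBQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    vectorstore=docsearch,
    k=min(4, len(texts)),  # the index was built from `texts` above
    return_source_documents=True,
)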