
Two different document loaders for Microsoft Word files


Hello, I've noticed that after the latest commit by @MthwRobinson there are two different modules for loading Word documents. Could they be unified into a single version? There are also two notebooks that do almost the same thing:

docx.py and word_document.py

microsoft_word.ipynb and word_document.ipynb

Or am I just missing something?

klein-t avatar Mar 16 '23 18:03 klein-t

I agree! There doesn't seem to be much to unify: docx.py only works with .docx files, while the word_document version also supports .doc, so docx.py is obsolete.
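
A quick sketch of the overlap (file names here are hypothetical):

from langchain.document_loaders import UnstructuredWordDocumentLoader

# The word_document loader covers both formats, so a docx-only loader adds nothing.
docs = UnstructuredWordDocumentLoader("report.doc").load()    # legacy .doc
docs += UnstructuredWordDocumentLoader("report.docx").load()  # modern .docx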

Janldeboer avatar Mar 18 '23 03:03 Janldeboer

I'm unable to load all the Word files present in the folder. Below is the code:

txt_loader = DirectoryLoader(folder_path, glob="./*.docx")
documents += txt_loader.load()

The code below works, but only for one file:

txt_loader = UnstructuredWordDocumentLoader(r"Q&A with PDF & Txt\Documents\Data Scientist.docx", mode="elements")  # raw string avoids backslash-escape issues on Windows
documents += txt_loader.load()

Is there any way to overcome this?

nithinreddyyyyyy avatar May 17 '23 11:05 nithinreddyyyyyy

@nithinreddyyyyyy - That seems like it should work. Could you post your unstructured version and the error message you're getting here? You can also pass UnstructuredWordDocumentLoader explicitly to the DirectoryLoader, like this:

loader = DirectoryLoader(folder_path, glob="./*.docx", loader_cls=UnstructuredWordDocumentLoader)
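
For reference, one way to check the installed unstructured version (assuming the package exposes __version__, as recent releases do):

import unstructured
print(unstructured.__version__)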

MthwRobinson avatar May 17 '23 14:05 MthwRobinson

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain import OpenAI, VectorDBQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
import os
import warnings
warnings.filterwarnings("ignore")

# Set up the environment variable for the OpenAI API key
os.environ["OPENAI_API_KEY"] = ""

def get_documents(folder_path, file_extension):
    documents = []
    if file_extension == 'pdf':
        pdf_loader = DirectoryLoader(folder_path, glob="./*.pdf", loader_cls=PyPDFLoader)  # Select PDF files
        documents += pdf_loader.load()
    elif file_extension == 'txt':
        txt_loader = DirectoryLoader(folder_path, glob="./*.txt")  # Select TXT files
        documents += txt_loader.load()
    elif file_extension == 'docx':
        docx_loader = DirectoryLoader(folder_path, glob="./*.docx", loader_cls=UnstructuredWordDocumentLoader)
        documents += docx_loader.load()
    elif file_extension == 'combined':
        pdf_loader = DirectoryLoader(folder_path, glob="./*.pdf", loader_cls=PyPDFLoader)  # Select PDF files
        documents += pdf_loader.load()
        txt_loader = DirectoryLoader(folder_path, glob="./*.txt")  # Select TXT files
        documents += txt_loader.load()
    else:
        return None

    return documents

def get_query_result(query, documents):
    # Split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    # Query documents
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])
    docsearch = Chroma.from_documents(texts, embeddings)
    qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch, return_source_documents=True)
    result = qa({"query": query})

    result_text = result['result'].strip()
    source = result.get('source_documents', [{}])[0].metadata.get('source', '')
    page = result.get('source_documents', [{}])[0].metadata.get('page', '')

    return result_text, source, page

def chat_loop(file_extension, folder_path):
    documents = get_documents(folder_path, file_extension)
    if documents is None:
        print("Invalid folder path or no supported files found.")
        return

    while True:
        query = input("Enter your query (type 'exit' to end): ")
        if query.lower() == 'exit':
            break

        result = get_query_result(query, documents)

        if result is not None:
            result_text, source, page = result
            print("Result:", result_text)
            if source:
                print("Source:", source)
                print("Page:", page)
        else:
            print("No answer found for the query.")

        print()  # Print an empty line for separation

# Get the selected file extension and folder path from the webpage
selected_file_extension = 'docx'  # Replace with the value obtained from the dropdown
folder_path = 'Documents'  # Replace with the folder path obtained from the user input on the webpage

# Start the chat loop
chat_loop(selected_file_extension, folder_path)

Above is the code; below is the error:

NotEnoughElementsException                Traceback (most recent call last)
Cell In[8], line 85
     82 folder_path = 'Documents'  # Replace with the folder path obtained from the user input on the webpage
     84 # Start the chat loop
---> 85 chat_loop(selected_file_extension, folder_path)

Cell In[8], line 67, in chat_loop(file_extension, folder_path)
     64 if query.lower() == 'exit':
     65     break
---> 67 result = get_query_result(query, documents)
     69 if result is not None:
     70     result_text, source, page = result

Cell In[8], line 48, in get_query_result(query, documents)
     46 docsearch = Chroma.from_documents(texts, embeddings)
     47 qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch, return_source_documents=True)
---> 48 result = qa({"query": query})
     50 result_text = result['result'].strip()
     51 source = result.get('source_documents', [{}])[0].metadata.get('source', '')

File ~\anaconda3\lib\site-packages\langchain\chains\base.py:116, in Chain.__call__(self, inputs, return_only_outputs)
    114 except (KeyboardInterrupt, Exception) as e:
    115     self.callback_manager.on_chain_error(e, verbose=self.verbose)
--> 116     raise e
    117 self.callback_manager.on_chain_end(outputs, verbose=self.verbose)
    118 return self.prep_outputs(inputs, outputs, return_only_outputs)

File ~\anaconda3\lib\site-packages\langchain\chains\base.py:113, in Chain.__call__(self, inputs, return_only_outputs)
    107 self.callback_manager.on_chain_start(
    108     {"name": self.__class__.__name__},
    109     inputs,
    110     verbose=self.verbose,
    111 )
    112 try:
--> 113     outputs = self._call(inputs)
    114 except (KeyboardInterrupt, Exception) as e:
    115     self.callback_manager.on_chain_error(e, verbose=self.verbose)

File ~\anaconda3\lib\site-packages\langchain\chains\retrieval_qa\base.py:109, in BaseRetrievalQA._call(self, inputs)
     96 """Run get_relevant_text and llm on input query.
     97 
     98 If chain has 'return_source_documents' as 'True', returns
   (...)
    105 answer, docs = res['result'], res['source_documents']
    106 """
    107 question = inputs[self.input_key]
--> 109 docs = self._get_docs(question)
    110 answer = self.combine_documents_chain.run(
    111     input_documents=docs, question=question
    112 )
    114 if self.return_source_documents:

File ~\anaconda3\lib\site-packages\langchain\chains\retrieval_qa\base.py:203, in VectorDBQA._get_docs(self, question)
    201 def _get_docs(self, question: str) -> List[Document]:
    202     if self.search_type == "similarity":
--> 203         docs = self.vectorstore.similarity_search(
    204             question, k=self.k, **self.search_kwargs
    205         )
    206     elif self.search_type == "mmr":
    207         docs = self.vectorstore.max_marginal_relevance_search(
    208             question, k=self.k, **self.search_kwargs
    209         )

File ~\anaconda3\lib\site-packages\langchain\vectorstores\chroma.py:144, in Chroma.similarity_search(self, query, k, filter, **kwargs)
    127 def similarity_search(
    128     self,
    129     query: str,
   (...)
    132     **kwargs: Any,
    133 ) -> List[Document]:
    134     """Run similarity search with Chroma.
    135 
    136     Args:
   (...)
    142         List[Document]: List of documents most similar to the query text.
    143     """
--> 144     docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
    145     return [doc for doc, _ in docs_and_scores]

File ~\anaconda3\lib\site-packages\langchain\vectorstores\chroma.py:190, in Chroma.similarity_search_with_score(self, query, k, filter, **kwargs)
    188 else:
    189     query_embedding = self._embedding_function.embed_query(query)
--> 190     results = self._collection.query(
    191         query_embeddings=[query_embedding], n_results=k, where=filter
    192     )
    194 return _results_to_docs_and_scores(results)

File ~\anaconda3\lib\site-packages\chromadb\api\models\Collection.py:219, in Collection.query(self, query_embeddings, query_texts, n_results, where, where_document, include)
    216 if where_document is None:
    217     where_document = {}
--> 219 return self._client._query(
    220     collection_name=self.name,
    221     query_embeddings=query_embeddings,
    222     n_results=n_results,
    223     where=where,
    224     where_document=where_document,
    225     include=include,
    226 )

File ~\anaconda3\lib\site-packages\chromadb\api\local.py:408, in LocalAPI._query(self, collection_name, query_embeddings, n_results, where, where_document, include)
    399 def _query(
    400     self,
    401     collection_name,
   (...)
    406     include: Include = ["documents", "metadatas", "distances"],
    407 ):
--> 408     uuids, distances = self._db.get_nearest_neighbors(
    409         collection_name=collection_name,
    410         where=where,
    411         where_document=where_document,
    412         embeddings=query_embeddings,
    413         n_results=n_results,
    414     )
    416     include_embeddings = "embeddings" in include
    417     include_documents = "documents" in include

File ~\anaconda3\lib\site-packages\chromadb\db\clickhouse.py:583, in Clickhouse.get_nearest_neighbors(self, where, where_document, embeddings, n_results, collection_name, collection_uuid)
    580     ids = None
    582 index = self._index(collection_uuid)
--> 583 uuids, distances = index.get_nearest_neighbors(embeddings, n_results, ids)
    585 return uuids, distances

File ~\anaconda3\lib\site-packages\chromadb\db\index\hnswlib.py:238, in Hnswlib.get_nearest_neighbors(self, query, k, ids)
    235 self._check_dimensionality(query)
    237 if k > self._index_metadata["elements"]:
--> 238     raise NotEnoughElementsException(
    239         f"Number of requested results {k} cannot be greater than number of elements in index {self._index_metadata['elements']}"
    240     )
    242 s2 = time.time()
    243 # get ids from uuids as a set, if they are available

NotEnoughElementsException: Number of requested results 4 cannot be greater than number of elements in index 3

I don't know why it is returning this error.

nithinreddyyyyyy avatar May 17 '23 14:05 nithinreddyyyyyy

I didn't see unstructured or document_loaders in the traceback (unless I missed it). Seems like the exception is coming from ChromaDB. Wondering if the source of this error might be in the Chroma vectorstore or somewhere else. It may be worthwhile to spin off a separate issue for this and flag it for someone more familiar with the vectorstores.
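
For what it's worth, the error message itself points to a workaround: the Chroma index holds only 3 chunks (with chunk_size=2000, a few short documents may not split into more), while VectorDBQA requests k=4 results by default (the self.k in the traceback). A minimal sketch of lowering k to match, using the k field VectorDBQA exposes:

qa = VectorDBQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    vectorstore=docsearch,
    return_source_documents=True,
    k=3,  # don't request more results than the index contains
)

Alternatively, a smaller chunk_size in RecursiveCharacterTextSplitter would produce more chunks, so the index holds at least k elements.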

MthwRobinson avatar May 17 '23 16:05 MthwRobinson