chatdocs
chatdocs copied to clipboard
Program crashes when Pandoc encounters an error.
Error handling when adding documents should be improved. At the moment, when Pandoc encounters a corrupt file, the entire chatdocs program crashes instead of skipping the corrupt file.
This is the specific error:
Loading new documents: 25%|████▌ | 251/985 [08:13<24:01, 1.96s/it]
RemoteTraceback:
"""
Traceback (most recent call last):
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/chatdocs/add.py", line 74, in load_single_document
return loader.load()
^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/langchain/document_loaders/unstructured.py", line 71, in load
elements = self._get_elements()
^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/langchain/document_loaders/epub.py", line 22, in _get_elements
return partition_epub(filename=self.file_path, **self.unstructured_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/unstructured/file_utils/filetype.py", line 365, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/unstructured/partition/epub.py", line 26, in partition_epub
return convert_and_partition_html(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/unstructured/partition/html.py", line 119, in convert_and_partition_html
html_text = convert_file_to_html_text(source_format=source_format, filename=filename, file=file)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/unstructured/file_utils/file_conversion.py", line 44, in convert_file_to_html_text
html_text = convert_file_to_text(
^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/unstructured/file_utils/file_conversion.py", line 12, in convert_file_to_text
text = pypandoc.convert_file(filename, target_format, format=source_format)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/pypandoc/__init__.py", line 168, in convert_file
return _convert_input(discovered_source_files, format, 'path', to, extra_args=extra_args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/pypandoc/__init__.py", line 426, in _convert_input
raise RuntimeError(
RuntimeError: Pandoc died with exitcode "64" during conversion: Couldn't extract ePub file: not enough bytes
"""
The above exception was the direct cause of the following exception:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages/chatdocs/main.py:40 in add │
│ │
│ 37 │ from .add import add │
│ 38 │ │
│ 39 │ config = get_config(config) │
│ ❱ 40 │ add(config=config, source_directory=str(directory)) │
│ 41 │
│ 42 │
│ 43 @app.command() │
│ │