markitdown icon indicating copy to clipboard operation
markitdown copied to clipboard

FileConversionException: Could not convert ''

Open guilhemvermorel opened this issue 11 months ago • 2 comments

Hi,

When I try to extract information from some PDFs in the DocLayNet dataset with md.convert, I get this error:

---------------------------------------------------------------------------
FileConversionException                   Traceback (most recent call last)
Cell In[3], [line 45](vscode-notebook-cell:?execution_count=3&line=45)
     [41](vscode-notebook-cell:?execution_count=3&line=41) page_hash = doc[:doc.find('.pdf')]
     [44](vscode-notebook-cell:?execution_count=3&line=44) start_time = time.time()
---> [45](vscode-notebook-cell:?execution_count=3&line=45) conv_result = md.convert(str(doc_path))
     [46](vscode-notebook-cell:?execution_count=3&line=46) diff_time = time.time() - start_time
     [47](vscode-notebook-cell:?execution_count=3&line=47) print(f"Time computing : {diff_time} s")

File c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\markitdown\_markitdown.py:1094, in MarkItDown.convert(self, source, **kwargs)
   [1092](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1092)         return self.convert_url(source, **kwargs)
   [1093](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1093)     else:
-> [1094](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1094)         return self.convert_local(source, **kwargs)
   [1095](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1095) # Request response
   [1096](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1096) elif isinstance(source, requests.Response):

File c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\markitdown\_markitdown.py:1114, in MarkItDown.convert_local(self, path, **kwargs)
   [1111](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1111)     self._append_ext(extensions, g)
   [1113](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1113) # Convert
-> [1114](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1114) return self._convert(path, extensions, **kwargs)

File c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\markitdown\_markitdown.py:1255, in MarkItDown._convert(self, local_path, extensions, **kwargs)
   [1253](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1253) # If we got this far without success, report any exceptions
   [1254](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1254) if len(error_trace) > 0:
-> [1255](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1255)     raise FileConversionException(
   [1256](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1256)         f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
   [1257](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1257)     )
   [1259](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1259) # Nothing can handle it!
   [1260](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1260) raise UnsupportedFormatException(
   [1261](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1261)     f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
   [1262](file:///C:/Users/AppData/Local/miniconda3/envs/myenv/Lib/site-packages/markitdown/_markitdown.py:1262) )

FileConversionException: Could not convert 'E:\users\.cache\huggingface\hub\datasets--pierreguillou--DocLayNet-large\snapshots\38ff443244c1b496c33ed237d3d4468daf24265c\data\part_dataset_3\part_dataset_3\test\pdfs\ccbe08f3390d47046dbb9d4c839788ba05a0f5e139ab6931a06e8304247c54f0.pdf' to Markdown. File type was recognized as ['.pdf', '.pdf', '.fdf']. While converting the file, the following error was encountered:

Traceback (most recent call last):
  File "c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\markitdown\_markitdown.py", line 1239, in _convert
    res = converter.convert(local_path, **_kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\markitdown\_markitdown.py", line 490, in convert
    text_content=pdfminer.high_level.extract_text(local_path),
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\pdfminer\high_level.py", line 169, in extract_text
    for page in PDFPage.get_pages(
                ^^^^^^^^^^^^^^^^^^
  File "c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\pdfminer\pdfpage.py", line 171, in get_pages
    for (pageno, page) in enumerate(cls.create_pages(doc)):
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\pdfminer\pdfpage.py", line 127, in create_pages
    yield cls(document, objid, tree, next(page_labels))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\AppData\Local\miniconda3\envs\myenv\Lib\site-packages\pdfminer\pdfpage.py", line 64, in __init__
    resolve1(mediabox_param) for mediabox_param in self.attrs["MediaBox"]
                                                   ~~~~~~~~~~^^^^^^^^^^^^
TypeError: 'PDFObjRef' object is not iterable

Has anyone already encountered this issue? It's really strange because the document is not a .fdf file but a .pdf one.

guilhemvermorel avatar Jan 13 '25 11:01 guilhemvermorel

Please give us some sample data; we will also test on our end and will try to update you.

aviral-bhardwaj avatar Jan 15 '25 07:01 aviral-bhardwaj