bug/Error for large pdf files
Describe the bug: An error is raised when processing a large PDF file (~5000 pages, 41 MB).
Following is the trace 2024-07-01 12:02:22 File "/app/doc_processing.py", line 113, in __extractPdfElements 2024-07-01 12:02:22 return partition_pdf( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/documents/elements.py", line 593, in wrapper 2024-07-01 12:02:22 elements = func(*args, **kwargs) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/file_utils/filetype.py", line 626, in wrapper 2024-07-01 12:02:22 elements = func(*args, **kwargs) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/file_utils/filetype.py", line 582, in wrapper 2024-07-01 12:02:22 elements = func(*args, **kwargs) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/chunking/dispatch.py", line 74, in wrapper 2024-07-01 12:02:22 elements = func(*args, **kwargs) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/partition/pdf.py", line 192, in partition_pdf 2024-07-01 12:02:22 return partition_pdf_or_image( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/partition/pdf.py", line 288, in partition_pdf_or_image 2024-07-01 12:02:22 elements = _partition_pdf_or_image_local( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/utils.py", line 249, in wrapper 2024-07-01 12:02:22 return func(*args, **kwargs) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured/partition/pdf.py", line 537, in _partition_pdf_or_image_local 2024-07-01 12:02:22 inferred_document_layout = process_file_with_model( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File 
"/usr/lib/python3.11/site-packages/unstructured_inference/inference/layout.py", line 370, in process_file_with_model 2024-07-01 12:02:22 else DocumentLayout.from_file( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured_inference/inference/layout.py", line 77, in from_file 2024-07-01 12:02:22 page = PageLayout.from_image( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured_inference/inference/layout.py", line 305, in from_image 2024-07-01 12:02:22 page.get_elements_with_detection_model() 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured_inference/inference/layout.py", line 190, in get_elements_with_detection_model 2024-07-01 12:02:22 inferred_layout: List[LayoutElement] = self.detection_model(self.image) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured_inference/models/unstructuredmodel.py", line 61, in call 2024-07-01 12:02:22 return super().call(x) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured_inference/models/unstructuredmodel.py", line 42, in call 2024-07-01 12:02:22 return self.predict(x) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured_inference/models/yolox.py", line 69, in predict 2024-07-01 12:02:22 return self.image_processing(x) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/unstructured_inference/models/yolox.py", line 109, in image_processing 2024-07-01 12:02:22 origin_img = np.array(image) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/PIL/Image.py", line 696, in array_interface 2024-07-01 12:02:22 new["data"] = self.tobytes() 2024-07-01 12:02:22 ^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File 
"/usr/lib/python3.11/site-packages/PIL/Image.py", line 755, in tobytes 2024-07-01 12:02:22 self.load() 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/PIL/ImageFile.py", line 288, in load 2024-07-01 12:02:22 raise OSError(msg) 2024-07-01 12:02:22 OSError: image file is truncated (2327 bytes not processed) 2024-07-01 12:02:22 2024-07-01 12:02:22 During handling of the above exception, another exception occurred: 2024-07-01 12:02:22 2024-07-01 12:02:22 Traceback (most recent call last): 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 411, in run_asgi 2024-07-01 12:02:22 result = await app( # type: ignore[func-returns-value] 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in call 2024-07-01 12:02:22 return await self.app(scope, receive, send) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/fastapi/applications.py", line 1054, in call 2024-07-01 12:02:22 await super().call(scope, receive, send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/applications.py", line 123, in call 2024-07-01 12:02:22 await self.middleware_stack(scope, receive, send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/middleware/errors.py", line 186, in call 2024-07-01 12:02:22 raise exc 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/middleware/errors.py", line 164, in call 2024-07-01 12:02:22 await self.app(scope, receive, _send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/middleware/exceptions.py", line 65, in call 2024-07-01 12:02:22 await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app 2024-07-01 
12:02:22 raise exc 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app 2024-07-01 12:02:22 await app(scope, receive, sender) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/routing.py", line 756, in call 2024-07-01 12:02:22 await self.middleware_stack(scope, receive, send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/routing.py", line 776, in app 2024-07-01 12:02:22 await route.handle(scope, receive, send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/routing.py", line 297, in handle 2024-07-01 12:02:22 await self.app(scope, receive, send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/routing.py", line 77, in app 2024-07-01 12:02:22 await wrap_app_handling_exceptions(app, request)(scope, receive, send) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app 2024-07-01 12:02:22 raise exc 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app 2024-07-01 12:02:22 await app(scope, receive, sender) 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/routing.py", line 72, in app 2024-07-01 12:02:22 response = await func(request) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/fastapi/routing.py", line 278, in app 2024-07-01 12:02:22 raw_response = await run_endpoint_function( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/fastapi/routing.py", line 193, in run_endpoint_function 2024-07-01 12:02:22 return await run_in_threadpool(dependant.call, **values) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/starlette/concurrency.py", line 42, in run_in_threadpool 2024-07-01 12:02:22 
return await anyio.to_thread.run_sync(func, *args) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/anyio/to_thread.py", line 56, in run_sync 2024-07-01 12:02:22 return await get_async_backend().run_sync_in_worker_thread( 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread 2024-07-01 12:02:22 return await future 2024-07-01 12:02:22 ^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/site-packages/anyio/_backends/_asyncio.py", line 859, in run 2024-07-01 12:02:22 result = context.run(func, *args) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/app/main.py", line 267, in handle_request 2024-07-01 12:02:22 raise HTTPException(status_code=404, detail=json.dumps(e)) 2024-07-01 12:02:22 ^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/json/init.py", line 231, in dumps 2024-07-01 12:02:22 return _default_encoder.encode(obj) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/json/encoder.py", line 200, in encode 2024-07-01 12:02:22 chunks = self.iterencode(o, _one_shot=True) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/json/encoder.py", line 258, in iterencode 2024-07-01 12:02:22 return _iterencode(o, 0) 2024-07-01 12:02:22 ^^^^^^^^^^^^^^^^^ 2024-07-01 12:02:22 File "/usr/lib/python3.11/json/encoder.py", line 180, in default 2024-07-01 12:02:22 raise TypeError(f'Object of type {o.class.name} ' 2024-07-01 12:02:22 TypeError: Object of type OSError is not JSON serializable
To Reproduce: Process a large PDF file.
Expected behavior: Processing should work for large files.
Does adding
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
taken from this Stack Overflow answer, help? I had a similar problem and this solved it, though I have not since been able to reproduce the bug without this workaround.
@magallardo Can you confirm this? I am facing the same issue.
Closing as inactive, assumed resolved. If you are still having trouble and can provide a specimen file that reproduces the error, we can reopen.