unstructured
unstructured copied to clipboard
bug/ UnidentifiedImageError with partition_pdf()
from typing import Any
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
raw_pdf_elements = partition_pdf(
filename="some_pdf.pdf",
extract_images_in_pdf=False,
infer_table_structure=True,
chunking_strategy="by_title",
max_characters=4000,
new_after_n_chars=3800,
combine_text_under_n_chars=2000,
image_output_dir_path=".",
)
Running this function with the `infer_table_structure` parameter set to `True` raises the following error:
UnidentifiedImageError: cannot identify image file '/tmp/tmp2uteptq8/dcd48c17-f12a-4303-9bce-cf3ca2064ae0-1.ppm'
Here is the complete log of the error message:
UnidentifiedImageError Traceback (most recent call last)
[<ipython-input-41-a835b92fbe3b>](https://localhost:8080/#) in <cell line: 7>()
5
6
----> 7 raw_pdf_elements = partition_pdf(
8 filename="/content/drive/MyDrive/Zexin_Xu_CV_1130.pdf",
9 extract_images_in_pdf=False,
10 frames
[/usr/local/lib/python3.10/dist-packages/unstructured/documents/elements.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
524 @functools.wraps(func)
525 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 526 elements = func(*args, **kwargs)
527 sig = inspect.signature(func)
528 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
[/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
617 @functools.wraps(func)
618 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 619 elements = func(*args, **kwargs)
620 sig = inspect.signature(func)
621 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
[/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
572 @functools.wraps(func)
573 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 574 elements = func(*args, **kwargs)
575 sig = inspect.signature(func)
576 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
[/usr/local/lib/python3.10/dist-packages/unstructured/chunking/__init__.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
67
68 # -- call the partitioning function to get the elements --
---> 69 elements = func(*args, **kwargs)
70
71 # -- look for a chunking-strategy argument and run the indicated chunker when present --
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py](https://localhost:8080/#) in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, include_metadata, metadata_filename, metadata_last_modified, chunking_strategy, links, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, **kwargs)
211 languages = check_language_args(languages or [], ocr_languages) or ["eng"]
212
--> 213 return partition_pdf_or_image(
214 filename=filename,
215 file=file,
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py](https://localhost:8080/#) in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, **kwargs)
540 with warnings.catch_warnings():
541 warnings.simplefilter("ignore")
--> 542 elements = _partition_pdf_or_image_local(
543 filename=filename,
544 file=spooled_to_bytes_io_if_needed(file),
[/usr/local/lib/python3.10/dist-packages/unstructured/utils.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
212 ),
213 )
--> 214 return func(*args, **kwargs)
215
216 return wrapper
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py](https://localhost:8080/#) in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, **kwargs)
312
313 if file is None:
--> 314 inferred_document_layout = process_file_with_model(
315 filename,
316 is_image=is_image,
[/usr/local/lib/python3.10/dist-packages/unstructured_inference/inference/layout.py](https://localhost:8080/#) in process_file_with_model(filename, model_name, is_image, fixed_layouts, pdf_image_dpi, **kwargs)
392 )
393 if is_image
--> 394 else DocumentLayout.from_file(
395 filename,
396 detection_model=detection_model,
[/usr/local/lib/python3.10/dist-packages/unstructured_inference/inference/layout.py](https://localhost:8080/#) in from_file(cls, filename, fixed_layouts, pdf_image_dpi, **kwargs)
76 # NOTE(robinson) - In the future, maybe we detect the page number and default
77 # to the index if it is not detected
---> 78 with Image.open(image_path) as image:
79 page = PageLayout.from_image(
80 image,
[/usr/local/lib/python3.10/dist-packages/PIL/Image.py](https://localhost:8080/#) in open(fp, mode, formats)
3281 continue
3282 except BaseException:
-> 3283 if exclusive_fp:
3284 fp.close()
3285 raise
It seems like the error is coming from the usage of PIL. Any idea how to fix this?
Hi @Asonjay, can you please share the document you're trying?
I have tried https://arxiv.org/pdf/2304.08485.pdf and https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/381953f9-934e-4cc8-b099-144910676bad.pdf
Hi @Asonjay, can you please share the document you're trying?
Any updates?
same issue, any update?
Hi @Asonjay @andrea-mucci
What versions of the `unstructured` and `unstructured-inference` libraries are you using? When I tried to reproduce the error, it worked on both the 2304.08485.pdf and CIK-0001045810/381953f9-934e-4cc8-b099-144910676bad.pdf documents with the following versions:
unstructured==0.12.5
unstructured-inference==0.7.24
Closing as assumed resolved. @Asonjay @andrea-mucci, feel free to reopen if you're still having trouble.
@christinestraub I faced the same issue with my PDF, but I can confirm it worked for me when I used the mentioned versions. Thank you! :+1: