unstructured
unstructured copied to clipboard
LocalEntryNotFoundError
from typing import Any
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
# Path to save images
path = "/home/zhanglv/code/small-project/data/"
filename =path + "LLaVA.pdf"
# Get elements
raw_pdf_elements = partition_pdf(
filename=path + "LLaVA.pdf",
# Using pdf format to find embedded image blocks
extract_images_in_pdf=True,
# Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
# Titles are any sub-section of the document
infer_table_structure=True,
# Post processing to aggregate text once we have the title
chunking_strategy="by_title",
url=None,
# Chunking params to aggregate text blocks
# Attempt to create a new chunk 3800 chars
# Attempt to keep chunks > 2000 chars
# Hard max on chunks
# max_characters=4000,
# new_after_n_chars=3800,
# combine_text_under_n_chars=2000,
# image_output_dir_path=path,
)
executed in 18.9s, finished 00:08:42 2023-11-25
---------------------------------------------------------------------------
LocalEntryNotFoundError Traceback (most recent call last)
Cell In[1], line 10
8 filename =path + "LLaVA.pdf"
9 # Get elements
---> 10 raw_pdf_elements = partition_pdf(
11 filename=path + "LLaVA.pdf",
12 # Using pdf format to find embedded image blocks
13 extract_images_in_pdf=True,
14 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
15 # Titles are any sub-section of the document
16 infer_table_structure=True,
17 # Post processing to aggregate text once we have the title
18 chunking_strategy="by_title",
19 url=None,
20 # Chunking params to aggregate text blocks
21 # Attempt to create a new chunk 3800 chars
22 # Attempt to keep chunks > 2000 chars
23 # Hard max on chunks
24 # max_characters=4000,
25 # new_after_n_chars=3800,
26 # combine_text_under_n_chars=2000,
27 # image_output_dir_path=path,
28 )
File ~/venv/small-project/lib/python3.8/site-packages/unstructured/documents/elements.py:306, in process_metadata.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
304 @functools.wraps(func)
305 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 306 elements = func(*args, **kwargs)
307 sig = inspect.signature(func)
308 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
File ~/venv/small-project/lib/python3.8/site-packages/unstructured/file_utils/filetype.py:551, in add_metadata_with_filetype.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
549 @functools.wraps(func)
550 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 551 elements = func(*args, **kwargs)
552 sig = inspect.signature(func)
553 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
File ~/venv/small-project/lib/python3.8/site-packages/unstructured/chunking/title.py:277, in add_chunking_strategy.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
275 @functools.wraps(func)
276 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 277 elements = func(*args, **kwargs)
278 sig = inspect.signature(func)
279 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
File ~/venv/small-project/lib/python3.8/site-packages/unstructured/partition/pdf.py:157, in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, max_partition, min_partition, include_metadata, metadata_filename, metadata_last_modified, chunking_strategy, links, **kwargs)
151 languages = convert_old_ocr_languages_to_languages(ocr_languages)
152 logger.warning(
153 "The ocr_languages kwarg will be deprecated in a future version of unstructured. "
154 "Please use languages instead.",
155 )
--> 157 return partition_pdf_or_image(
158 filename=filename,
159 file=file,
160 include_page_breaks=include_page_breaks,
161 strategy=strategy,
162 infer_table_structure=infer_table_structure,
163 languages=languages,
164 max_partition=max_partition,
165 min_partition=min_partition,
166 metadata_last_modified=metadata_last_modified,
167 **kwargs,
168 )
File ~/venv/small-project/lib/python3.8/site-packages/unstructured/partition/pdf.py:287, in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, max_partition, min_partition, metadata_last_modified, **kwargs)
285 with warnings.catch_warnings():
286 warnings.simplefilter("ignore")
--> 287 _layout_elements = _partition_pdf_or_image_local(
288 filename=filename,
289 file=spooled_to_bytes_io_if_needed(file),
290 is_image=is_image,
291 infer_table_structure=infer_table_structure,
292 include_page_breaks=include_page_breaks,
293 languages=languages,
294 metadata_last_modified=metadata_last_modified or last_modification_date,
295 **kwargs,
296 )
297 layout_elements = []
298 for el in _layout_elements:
File ~/venv/small-project/lib/python3.8/site-packages/unstructured/utils.py:178, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
169 if len(missing_deps) > 0:
170 raise ImportError(
171 f"Following dependencies are missing: {', '.join(missing_deps)}. "
172 + (
(...)
176 ),
177 )
--> 178 return func(*args, **kwargs)
File ~/venv/small-project/lib/python3.8/site-packages/unstructured/partition/pdf.py:377, in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_mode, model_name, metadata_last_modified, **kwargs)
373 process_with_model_kwargs[key] = value
375 if file is None:
376 # NOTE(christine): out_layout = extracted_layout + inferred_layout
--> 377 out_layout = process_file_with_model(
378 filename,
379 is_image=is_image,
380 extract_tables=infer_table_structure,
381 model_name=model_name,
382 pdf_image_dpi=pdf_image_dpi,
383 **process_with_model_kwargs,
384 )
385 if model_name.startswith("chipper"):
386 # NOTE(alan): We shouldn't do OCR with chipper
387 final_layout = out_layout
File ~/venv/small-project/lib/python3.8/site-packages/unstructured_inference/inference/layout.py:481, in process_file_with_model(filename, model_name, is_image, fixed_layouts, extract_tables, pdf_image_dpi, **kwargs)
469 def process_file_with_model(
470 filename: str,
471 model_name: Optional[str],
(...)
476 **kwargs,
477 ) -> DocumentLayout:
478 """Processes pdf file with name filename into a DocumentLayout by using a model identified by
479 model_name."""
--> 481 model = get_model(model_name, **kwargs)
482 if isinstance(model, UnstructuredObjectDetectionModel):
483 detection_model = model
File ~/venv/small-project/lib/python3.8/site-packages/unstructured_inference/models/base.py:55, in get_model(model_name, **kwargs)
53 elif model_name in YOLOX_MODEL_TYPES:
54 model = UnstructuredYoloXModel()
---> 55 initialize_params = {**YOLOX_MODEL_TYPES[model_name], **kwargs}
56 elif model_name in CHIPPER_MODEL_TYPES:
57 logger.warning(
58 "The Chipper model is currently in Beta and is not yet ready for production use. "
59 "You can reach out to the Unstructured engineering team in the Unstructured "
(...)
63 "zt-1x7cgo0pg-PTptXWylzPQF9xZolzCnwQ",
64 )
File ~/venv/small-project/lib/python3.8/site-packages/unstructured_inference/utils.py:44, in LazyDict.__getitem__(self, key)
42 evaluate = value.evaluate
43 args, kwargs = value.info
---> 44 value = evaluate(*args, **kwargs)
45 if self.cache:
46 self._raw_dict[key] = value
File ~/venv/small-project/lib/python3.8/site-packages/huggingface_hub/utils/_validators.py:118, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
115 if check_use_auth_token:
116 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 118 return fn(*args, **kwargs)
File ~/venv/small-project/lib/python3.8/site-packages/huggingface_hub/file_download.py:1291, in hf_hub_download(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, local_dir_use_symlinks, user_agent, force_download, force_filename, proxies, etag_timeout, resume_download, token, local_files_only, legacy_cache_layout)
1285 raise LocalEntryNotFoundError(
1286 "Cannot find the requested files in the disk cache and"
1287 " outgoing traffic has been disabled. To enable hf.co look-ups"
1288 " and downloads online, set 'local_files_only' to False."
1289 )
1290 else:
-> 1291 raise LocalEntryNotFoundError(
1292 "Connection error, and we cannot find the requested files in"
1293 " the disk cache. Please try again or make sure your Internet"
1294 " connection is on."
1295 )
1297 # From now on, etag and commit_hash are not None.
1298 assert etag is not None, "etag must have been retrieved from server"
LocalEntryNotFoundError: Connection error, and we cannot find the requested files in the disk cache. Please try again or make sure your Internet connection is on.
This is the error I get. How can I resolve it?
Hi, did you solve the problem?
Also stuck here.
Hello, I got the same issue when `infer_table_structure=True`:
LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.
Any solution to avoid this error?
This error comes from the `huggingface_hub` Python library and indicates that the library is unable to connect to the HuggingFace API to download a model used by `unstructured-inference`.
If you do not have outbound internet connectivity this can cause this error. Also, if the huggingface API is unresponsive for whatever reason, this can also produce this error.
If you have outbound connectivity, retrying several times, as odd as that seems, is reported to help. Perhaps under load the HF API will sometimes respond if you try repeatedly.
Hello, I got the same issues when `infer_table_structure=True`:
LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.
Any solution to avoid this error?
@Auronus92 were you able to resolve this issue? I am having the same error.
Per @scanny's comment, a lack of internet connectivity or the HF API being temporarily unavailable is likely the issue here. We can add a more informative error message to that effect, though.