localGPT
localGPT copied to clipboard
Ingest fails: Resource punkt not found.
Fresh git clone on a MacBook Pro with an M2 chip
Command: python ingest.py --device_type mps
2023-10-15 14:07:26,913 - INFO - ingest.py:121 - Loading documents from /Users/******/Desktop/localGPT2/localGPT/SOURCE_DOCUMENTS
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data] CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data] self signed certificate in certificate chain
[nltk_data] (_ssl.c:997)>
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/concurrent/futures/process.py", line 243, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/Users/******/Desktop/localGPT2/localGPT/ingest.py", line 40, in load_document_batch
data_list = [future.result() for future in futures]
File "/Users/******/Desktop/localGPT2/localGPT/ingest.py", line 40, in <listcomp>
data_list = [future.result() for future in futures]
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/concurrent/futures/_base.py", line 445, in result
return self.__get_result()
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/concurrent/futures/_base.py", line 390, in __get_result
raise self._exception
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/concurrent/futures/thread.py", line 52, in run
result = self.fn(*self.args, **self.kwargs)
File "/Users/******/Desktop/localGPT2/localGPT/ingest.py", line 30, in load_single_document
return loader.load()[0]
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/langchain/document_loaders/unstructured.py", line 86, in load
elements = self._get_elements()
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/langchain/document_loaders/unstructured.py", line 171, in _get_elements
return partition(filename=self.file_path, **self.unstructured_kwargs)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/auto.py", line 362, in partition
elements = _partition_pdf(
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/documents/elements.py", line 306, in wrapper
elements = func(*args, **kwargs)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/file_utils/filetype.py", line 551, in wrapper
elements = func(*args, **kwargs)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/chunking/title.py", line 277, in wrapper
elements = func(*args, **kwargs)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/pdf.py", line 157, in partition_pdf
return partition_pdf_or_image(
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/pdf.py", line 261, in partition_pdf_or_image
extracted_elements = extractable_elements(
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/pdf.py", line 180, in extractable_elements
return _partition_pdf_with_pdfminer(
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/utils.py", line 178, in wrapper
return func(*args, **kwargs)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/pdf.py", line 486, in _partition_pdf_with_pdfminer
elements = _process_pdfminer_pages(
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/pdf.py", line 584, in _process_pdfminer_pages
element = element_from_text(
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/text.py", line 308, in element_from_text
elif is_possible_narrative_text(text):
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/text_type.py", line 77, in is_possible_narrative_text
if exceeds_cap_ratio(text, threshold=cap_threshold):
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/text_type.py", line 274, in exceeds_cap_ratio
if sentence_count(text, 3) > 1:
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/partition/text_type.py", line 222, in sentence_count
sentences = sent_tokenize(text)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/unstructured/nlp/tokenize.py", line 30, in sent_tokenize
return _sent_tokenize(text)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/nltk/tokenize/__init__.py", line 106, in sent_tokenize
tokenizer = load(f"tokenizers/punkt/{language}.pickle")
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/nltk/data.py", line 750, in load
opened_resource = _open(resource_url)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/nltk/data.py", line 876, in _open
return find(path_, path + [""]).open()
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/nltk/data.py", line 583, in find
raise LookupError(resource_not_found)
LookupError:
**********************************************************************
Resource punkt not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt/PY3/english.pickle
Searched in:
- '/Users/******/nltk_data'
- '/Users/******/anaconda3/envs/localGPT_llama2/nltk_data'
- '/Users/******/anaconda3/envs/localGPT_llama2/share/nltk_data'
- '/Users/******/anaconda3/envs/localGPT_llama2/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
- ''
**********************************************************************
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/******/Desktop/localGPT2/localGPT/ingest.py", line 159, in <module>
main()
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/Users/******/Desktop/localGPT2/localGPT/ingest.py", line 122, in main
documents = load_documents(SOURCE_DIRECTORY)
File "/Users/******/Desktop/localGPT2/localGPT/ingest.py", line 71, in load_documents
contents, _ = future.result()
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/concurrent/futures/_base.py", line 438, in result
return self.__get_result()
File "/Users/******/anaconda3/envs/localGPT_llama2/lib/python3.10/concurrent/futures/_base.py", line 390, in __get_result
raise self._exception
LookupError:
**********************************************************************
Resource punkt not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt/PY3/english.pickle
Searched in:
- '/Users/******/nltk_data'
- '/Users/******/anaconda3/envs/localGPT_llama2/nltk_data'
- '/Users/******/anaconda3/envs/localGPT_llama2/share/nltk_data'
- '/Users/******/anaconda3/envs/localGPT_llama2/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
- ''
**********************************************************************
Same issue here. I haven't dug into the root cause, but a quick workaround is to download the nltk data files manually and then unzip them. This was for Linux but should be similar on Mac.
python -m nltk.downloader all
cd ~/nltk_data/tokenizers
unzip punkt.zip
Same issue on Linux.