Indexing Failure: RecursionError
During a Jira indexing run, the following traceback is raised:
```
Traceback (most recent call last):
  File "/app/danswer/background/indexing/run_indexing.py", line 219, in _run_indexing
    new_docs, total_batch_chunks = indexing_pipeline(
                                   ^^^^^^^^^^^^^^^^^^
  File "/app/danswer/indexing/indexing_pipeline.py", line 160, in index_doc_batch_with_handler
    r = index_doc_batch(
        ^^^^^^^^^^^^^^^^
  File "/app/danswer/utils/timing.py", line 31, in wrapped_func
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/app/danswer/indexing/indexing_pipeline.py", line 281, in index_doc_batch
    all_chunks.extend(chunker.chunk(document=document))
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/danswer/indexing/chunker.py", line 308, in chunk
    normal_chunks = self._chunk_document(
                    ^^^^^^^^^^^^^^^^^^^^^
  File "/app/danswer/indexing/chunker.py", line 216, in _chunk_document
    split_texts = self.chunk_splitter.split_text(section_text)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/sentence.py", line 170, in split_text
    return self._split_text(text, chunk_size=self.chunk_size)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/sentence.py", line 184, in _split_text
    splits = self._split(text, chunk_size)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/sentence.py", line 219, in _split
    recursive_text_splits = self._split(
                            ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/sentence.py", line 219, in _split
    recursive_text_splits = self._split(
                            ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/sentence.py", line 219, in _split
    recursive_text_splits = self._split(
                            ^^^^^^^^^^^^
  [Previous line repeated 957 more times]
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/sentence.py", line 205, in _split
    text_splits_by_fns, is_sentence = self._get_splits_by_fns(text)
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/sentence.py", line 306, in _get_splits_by_fns
    splits = split_fn(text)
             ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/llama_index/node_parser/text/utils.py", line 46, in split
    spans = list(tokenizer.span_tokenize(text))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1329, in span_tokenize
    for sentence in slices:
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1459, in _realign_boundaries
    for sentence1, sentence2 in _pair_iter(slices):
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 321, in _pair_iter
    prev = next(iterator)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1432, in _slices_from_text
    if self.text_contains_sentbreak(context):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1480, in text_contains_sentbreak
    for tok in self._annotate_tokens(self._tokenize_words(text)):
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1622, in _annotate_second_pass
    for token1, token2 in _pair_iter(tokens):
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 321, in _pair_iter
    prev = next(iterator)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 603, in _annotate_first_pass
    for aug_tok in tokens:
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 572, in _tokenize_words
    yield self._Token(tok, parastart=parastart, linestart=True)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 403, in __init__
    self.type = self._get_type(tok)
                ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 426, in _get_type
    return self._RE_NUMERIC.sub("##number##", tok.lower())
                                              ^^^^^^^^^^^
RecursionError: maximum recursion depth exceeded while calling a Python object
```
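The depth is telling: the 960-odd repeated `_split` frames from llama_index's `SentenceSplitter`, plus the surrounding frames, land right around CPython's default recursion limit of 1000. The failure mode itself is plain Python behavior, as a minimal standalone sketch (nothing Onyx- or llama_index-specific) shows:

```python
import sys

def split(depth: int = 0) -> None:
    # Stands in for SentenceSplitter._split, which calls itself once for
    # every split that is still larger than chunk_size. Each level costs
    # one stack frame; past sys.getrecursionlimit(), Python raises.
    split(depth + 1)

try:
    split()
except RecursionError:
    print(f"RecursionError after ~{sys.getrecursionlimit()} frames")
```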
A workaround is to raise Python's recursion limit before the indexing job starts. Something like:
```python
import os
import sys

# CPython's default limit is 1000. Note that os.getenv returns a string,
# so the value must be cast to int before being passed on.
sys.setrecursionlimit(int(os.getenv("RECURSION_LIMIT", "3000")))
```
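If a process-wide limit bump feels too blunt, a narrower variant is to raise the limit only around the `chunker.chunk(...)` call that fails in the traceback above, and restore it afterwards. A minimal sketch, using a hypothetical `chunk_with_recursion_guard` helper that is not part of the Onyx codebase:

```python
import sys

def chunk_with_recursion_guard(chunker, document, limit: int = 3000):
    # Hypothetical wrapper: temporarily raise the recursion limit for a
    # single chunking call, then restore the previous value so the rest
    # of the process keeps the default stack-depth safety net.
    old_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(max(old_limit, limit))
    try:
        return chunker.chunk(document=document)
    finally:
        sys.setrecursionlimit(old_limit)
```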
Fixed by #2689