llama_index icon indicating copy to clipboard operation
llama_index copied to clipboard

GPTSimpleVectorIndex has assert error

Open mobilestack opened this issue 2 years ago • 1 comments

from gpt_index import GPTSimpleVectorIndex
index = GPTSimpleVectorIndex([])

Even with just these two lines — creating an index from an empty list — it still raises the error below:

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[1], line 2
      1 from gpt_index import GPTSimpleVectorIndex
----> 2 index = GPTSimpleVectorIndex([])

File ~/work/venv310/lib/python3.10/site-packages/gpt_index/indices/vector_store/vector_indices.py:84, in GPTSimpleVectorIndex.__init__(self, documents, index_struct, text_qa_template, llm_predictor, embed_model, simple_vector_store_data_dict, **kwargs)
     79 """Init params."""
     80 vector_store = SimpleVectorStore(
     81     simple_vector_store_data_dict=simple_vector_store_data_dict
     82 )
---> 84 super().__init__(
     85     documents=documents,
     86     index_struct=index_struct,
     87     text_qa_template=text_qa_template,
     88     llm_predictor=llm_predictor,
     89     embed_model=embed_model,
     90     vector_store=vector_store,
     91     **kwargs,
     92 )
     94 # TODO: Temporary hack to also store embeddings in index_struct
     95 embedding_dict = vector_store._data.embedding_dict

File ~/work/venv310/lib/python3.10/site-packages/gpt_index/indices/vector_store/base.py:63, in GPTVectorStoreIndex.__init__(self, documents, index_struct, text_qa_template, llm_predictor, embed_model, vector_store, text_splitter, use_async, **kwargs)
     61 self.text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
     62 self._use_async = use_async
---> 63 super().__init__(
     64     documents=documents,
     65     index_struct=index_struct,
     66     llm_predictor=llm_predictor,
     67     embed_model=embed_model,
     68     text_splitter=text_splitter,
     69     **kwargs,
     70 )

File ~/work/venv310/lib/python3.10/site-packages/gpt_index/indices/base.py:83, in BaseGPTIndex.__init__(self, documents, index_struct, llm_predictor, embed_model, docstore, index_registry, prompt_helper, text_splitter, chunk_size_limit, include_extra_info)
     81 self._llm_predictor = llm_predictor or LLMPredictor()
     82 # NOTE: the embed_model isn't used in all indices
---> 83 self._embed_model = embed_model or OpenAIEmbedding()
     84 self._include_extra_info = include_extra_info
     86 # TODO: move out of base if we need custom params per index

File ~/work/venv310/lib/python3.10/site-packages/gpt_index/embeddings/openai.py:208, in OpenAIEmbedding.__init__(self, mode, model, deployment_name)
    201 def __init__(
    202     self,
    203     mode: str = OpenAIEmbeddingMode.TEXT_SEARCH_MODE,
    204     model: str = OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002,
    205     deployment_name: Optional[str] = None,
    206 ) -> None:
    207     """Init params."""
--> 208     super().__init__()
    209     self.mode = OpenAIEmbeddingMode(mode)
    210     self.model = OpenAIEmbeddingModelType(model)

File ~/work/venv310/lib/python3.10/site-packages/gpt_index/embeddings/base.py:55, in BaseEmbedding.__init__(self, embed_batch_size)
     53 self._total_tokens_used = 0
     54 self._last_token_usage: Optional[int] = None
---> 55 self._tokenizer: Callable = globals_helper.tokenizer
     56 # list of tuples of id, text
     57 self._text_queue: List[Tuple[str, str]] = []

File ~/work/venv310/lib/python3.10/site-packages/gpt_index/utils.py:38, in GlobalsHelper.tokenizer(self)
     36     except ImportError:
     37         raise ImportError(tiktoken_import_err)
---> 38     enc = tiktoken.get_encoding("gpt2")
     39     self._tokenizer = cast(Callable[[str], List], enc.encode)
     40 else:

File ~/work/venv310/lib/python3.10/site-packages/tiktoken/registry.py:63, in get_encoding(encoding_name)
     60     raise ValueError(f"Unknown encoding {encoding_name}")
     62 constructor = ENCODING_CONSTRUCTORS[encoding_name]
---> 63 enc = Encoding(**constructor())
     64 ENCODINGS[encoding_name] = enc
     65 return enc

File ~/work/venv310/lib/python3.10/site-packages/tiktoken_ext/openai_public.py:11, in gpt2()
     10 def gpt2():
---> 11     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
     12         vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
     13         encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
     14     )
     15     return {
     16         "name": "gpt2",
     17         "explicit_n_vocab": 50257,
   (...)
     20         "special_tokens": {"<|endoftext|>": 50256},
     21     }

File ~/work/venv310/lib/python3.10/site-packages/tiktoken/load.py:95, in data_gym_to_mergeable_bpe_ranks(vocab_bpe_file, encoder_json_file)
     93 encoder_json_loaded.pop(b"<|endoftext|>", None)
     94 encoder_json_loaded.pop(b"<|startoftext|>", None)
---> 95 assert bpe_ranks == encoder_json_loaded
     97 return bpe_ranks

AssertionError: 

My tiktoken installation is as follows:

pip install -U tiktoken
Requirement already satisfied: tiktoken in ./venv310/lib/python3.10/site-packages (0.3.1)
Requirement already satisfied: regex>=2022.1.18 in ./venv310/lib/python3.10/site-packages (from tiktoken) (2022.10.31)
Requirement already satisfied: requests>=2.26.0 in ./venv310/lib/python3.10/site-packages (from tiktoken) (2.28.2)
Requirement already satisfied: certifi>=2017.4.17 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (1.26.9)
Requirement already satisfied: idna<4,>=2.5 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (3.3)
Requirement already satisfied: charset-normalizer<4,>=2 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (2.0.12)

I hope someone can help. Many thanks!

mobilestack avatar Mar 14 '23 13:03 mobilestack

And I use Macbook air M1.

mobilestack avatar Mar 14 '23 13:03 mobilestack

Problem solved — see https://github.com/openai/tiktoken/issues/63#issuecomment-1469536106. You have to delete the cached encoding files inside tiktoken's cache folder (the stale cache causes the `assert bpe_ranks == encoder_json_loaded` failure).

mobilestack avatar Mar 15 '23 08:03 mobilestack