llama_index
llama_index copied to clipboard
GPTSimpleVectorIndex has assert error
from gpt_index import GPTSimpleVectorIndex
index = GPTSimpleVectorIndex([])
Even with just these two lines — creating an index from an empty list — it still raises the error below:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Cell In[1], line 2
1 from gpt_index import GPTSimpleVectorIndex
----> 2 index = GPTSimpleVectorIndex([])
File ~/work/venv310/lib/python3.10/site-packages/gpt_index/indices/vector_store/vector_indices.py:84, in GPTSimpleVectorIndex.__init__(self, documents, index_struct, text_qa_template, llm_predictor, embed_model, simple_vector_store_data_dict, **kwargs)
79 """Init params."""
80 vector_store = SimpleVectorStore(
81 simple_vector_store_data_dict=simple_vector_store_data_dict
82 )
---> 84 super().__init__(
85 documents=documents,
86 index_struct=index_struct,
87 text_qa_template=text_qa_template,
88 llm_predictor=llm_predictor,
89 embed_model=embed_model,
90 vector_store=vector_store,
91 **kwargs,
92 )
94 # TODO: Temporary hack to also store embeddings in index_struct
95 embedding_dict = vector_store._data.embedding_dict
File ~/work/venv310/lib/python3.10/site-packages/gpt_index/indices/vector_store/base.py:63, in GPTVectorStoreIndex.__init__(self, documents, index_struct, text_qa_template, llm_predictor, embed_model, vector_store, text_splitter, use_async, **kwargs)
61 self.text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
62 self._use_async = use_async
---> 63 super().__init__(
64 documents=documents,
65 index_struct=index_struct,
66 llm_predictor=llm_predictor,
67 embed_model=embed_model,
68 text_splitter=text_splitter,
69 **kwargs,
70 )
File ~/work/venv310/lib/python3.10/site-packages/gpt_index/indices/base.py:83, in BaseGPTIndex.__init__(self, documents, index_struct, llm_predictor, embed_model, docstore, index_registry, prompt_helper, text_splitter, chunk_size_limit, include_extra_info)
81 self._llm_predictor = llm_predictor or LLMPredictor()
82 # NOTE: the embed_model isn't used in all indices
---> 83 self._embed_model = embed_model or OpenAIEmbedding()
84 self._include_extra_info = include_extra_info
86 # TODO: move out of base if we need custom params per index
File ~/work/venv310/lib/python3.10/site-packages/gpt_index/embeddings/openai.py:208, in OpenAIEmbedding.__init__(self, mode, model, deployment_name)
201 def __init__(
202 self,
203 mode: str = OpenAIEmbeddingMode.TEXT_SEARCH_MODE,
204 model: str = OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002,
205 deployment_name: Optional[str] = None,
206 ) -> None:
207 """Init params."""
--> 208 super().__init__()
209 self.mode = OpenAIEmbeddingMode(mode)
210 self.model = OpenAIEmbeddingModelType(model)
File ~/work/venv310/lib/python3.10/site-packages/gpt_index/embeddings/base.py:55, in BaseEmbedding.__init__(self, embed_batch_size)
53 self._total_tokens_used = 0
54 self._last_token_usage: Optional[int] = None
---> 55 self._tokenizer: Callable = globals_helper.tokenizer
56 # list of tuples of id, text
57 self._text_queue: List[Tuple[str, str]] = []
File ~/work/venv310/lib/python3.10/site-packages/gpt_index/utils.py:38, in GlobalsHelper.tokenizer(self)
36 except ImportError:
37 raise ImportError(tiktoken_import_err)
---> 38 enc = tiktoken.get_encoding("gpt2")
39 self._tokenizer = cast(Callable[[str], List], enc.encode)
40 else:
File ~/work/venv310/lib/python3.10/site-packages/tiktoken/registry.py:63, in get_encoding(encoding_name)
60 raise ValueError(f"Unknown encoding {encoding_name}")
62 constructor = ENCODING_CONSTRUCTORS[encoding_name]
---> 63 enc = Encoding(**constructor())
64 ENCODINGS[encoding_name] = enc
65 return enc
File ~/work/venv310/lib/python3.10/site-packages/tiktoken_ext/openai_public.py:11, in gpt2()
10 def gpt2():
---> 11 mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
12 vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
13 encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
14 )
15 return {
16 "name": "gpt2",
17 "explicit_n_vocab": 50257,
(...)
20 "special_tokens": {"<|endoftext|>": 50256},
21 }
File ~/work/venv310/lib/python3.10/site-packages/tiktoken/load.py:95, in data_gym_to_mergeable_bpe_ranks(vocab_bpe_file, encoder_json_file)
93 encoder_json_loaded.pop(b"<|endoftext|>", None)
94 encoder_json_loaded.pop(b"<|startoftext|>", None)
---> 95 assert bpe_ranks == encoder_json_loaded
97 return bpe_ranks
AssertionError:
My tiktoken installation is as follows:
pip install -U tiktoken
Requirement already satisfied: tiktoken in ./venv310/lib/python3.10/site-packages (0.3.1)
Requirement already satisfied: regex>=2022.1.18 in ./venv310/lib/python3.10/site-packages (from tiktoken) (2022.10.31)
Requirement already satisfied: requests>=2.26.0 in ./venv310/lib/python3.10/site-packages (from tiktoken) (2.28.2)
Requirement already satisfied: certifi>=2017.4.17 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (1.26.9)
Requirement already satisfied: idna<4,>=2.5 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (3.3)
Requirement already satisfied: charset-normalizer<4,>=2 in ./venv310/lib/python3.10/site-packages (from requests>=2.26.0->tiktoken) (2.0.12)
I hope someone can help. Many thanks!
For reference, I am using a MacBook Air M1.
Problem solved — see https://github.com/openai/tiktoken/issues/63#issuecomment-1469536106. I had to delete the cached encoding files inside tiktoken's cache folder (the stale cache caused the `assert bpe_ranks == encoder_json_loaded` failure).