ValueError: Query vector size 1024 does not match index column size 1536
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[65], line 1
----> 1 result = await search_engine.asearch("Tell me about Agent Mercer")
      2 print(result.response)

File D:\Documents\graphrag\graphrag\query\structured_search\local_search\search.py:66, in LocalSearch.asearch(self, query, conversation_history, **kwargs)
     63 start_time = time.time()
     64 search_prompt = ""
---> 66 context_text, context_records = self.context_builder.build_context(
     67     query=query,
     68     conversation_history=conversation_history,
     69     **kwargs,
     70     **self.context_builder_params,
     71 )
     72 log.info("GENERATE ANSWER: %s. QUERY: %s", start_time, query)
     73 try:

File D:\Documents\graphrag\graphrag\query\structured_search\local_search\mixed_context.py:139, in LocalSearchMixedContext.build_context(self, query, conversation_history, include_entity_names, exclude_entity_names, conversation_history_max_turns, conversation_history_user_turns_only, max_tokens, text_unit_prop, community_prop, top_k_mapped_entities, top_k_relationships, include_community_rank, include_entity_rank, rank_description, include_relationship_weight, relationship_ranking_attribute, return_candidate_context, use_community_summary, min_community_rank, community_context_name, column_delimiter, **kwargs)
    134 pre_user_questions = "\n".join(
    135     conversation_history.get_user_turns(conversation_history_max_turns)
    136 )
    137 query = f"{query}\n{pre_user_questions}"
--> 139 selected_entities = map_query_to_entities(
    140     query=query,
    141     text_embedding_vectorstore=self.entity_text_embeddings,
    142     text_embedder=self.text_embedder,
    143     all_entities=list(self.entities.values()),
    144     embedding_vectorstore_key=self.embedding_vectorstore_key,
    145     include_entity_names=include_entity_names,
    146     exclude_entity_names=exclude_entity_names,
    147     k=top_k_mapped_entities,
    148     oversample_scaler=2,
    149 )
    151 # build context
    152 final_context = list[str]()

File D:\Documents\graphrag\graphrag\query\context_builder\entity_extraction.py:55, in map_query_to_entities(query, text_embedding_vectorstore, text_embedder, all_entities, embedding_vectorstore_key, include_entity_names, exclude_entity_names, k, oversample_scaler)
     51 matched_entities = []
     52 if query != "":
     53     # get entities with highest semantic similarity to query
     54     # oversample to account for excluded entities
---> 55     search_results = text_embedding_vectorstore.similarity_search_by_text(
     56         text=query,
     57         text_embedder=lambda t: text_embedder.embed(t),
     58         k=k * oversample_scaler,
     59     )
     60     for result in search_results:
     61         matched = get_entity_by_key(
     62             entities=all_entities,
     63             key=embedding_vectorstore_key,
     64             value=result.document.id,
     65         )

File D:\Documents\graphrag\graphrag\vector_stores\lancedb.py:120, in LanceDBVectorStore.similarity_search_by_text(self, text, text_embedder, k, **kwargs)
    118 query_embedding = text_embedder(text)
    119 if query_embedding:
--> 120     return self.similarity_search_by_vector(query_embedding, k)
    121 return []

File D:\Documents\graphrag\graphrag\vector_stores\lancedb.py:99, in LanceDBVectorStore.similarity_search_by_vector(self, query_embedding, k, **kwargs)
     89     docs = (
     90         self.document_collection.search(query=query_embedding)
     91         .where(self.query_filter, prefilter=True)
     92         .limit(k)
     93         .to_list()
     94     )
     95 else:
     96     docs = (
     97         self.document_collection.search(query=query_embedding)
     98         .limit(k)
---> 99         .to_list()
    100     )
    101 return [
    102     VectorStoreSearchResult(
    103         document=VectorStoreDocument(
    (...)
    111     for doc in docs
    112 ]

File e:\usual_app\anaconda\envs\GraphRAG\Lib\site-packages\lancedb\query.py:303, in LanceQueryBuilder.to_list(self)
    295 def to_list(self) -> List[dict]:
    296     """
    297     Execute the query and return the results as a list of dictionaries.
    298
    (...)
    301     fields are returned whether or not they're explicitly selected.
    302     """
--> 303     return self.to_arrow().to_pylist()

File e:\usual_app\anaconda\envs\GraphRAG\Lib\site-packages\lancedb\query.py:528, in LanceVectorQueryBuilder.to_arrow(self)
    519 def to_arrow(self) -> pa.Table:
    520     """
    521     Execute the query and return the results as an
    522     [Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).
    (...)
    526     vector and the returned vectors.
    527     """
--> 528     return self.to_batches().read_all()

File e:\usual_app\anaconda\envs\GraphRAG\Lib\site-packages\lancedb\query.py:558, in LanceVectorQueryBuilder.to_batches(self, batch_size)
    545 vector = [v.tolist() for v in vector]
    546 query = Query(
    547     vector=vector,
    548     filter=self._where,
    (...)
    556     with_row_id=self._with_row_id,
    557 )
--> 558 result_set = self._table._execute_query(query, batch_size)
    559 if self._reranker is not None:
    560     rs_table = result_set.read_all()

File e:\usual_app\anaconda\envs\GraphRAG\Lib\site-packages\lancedb\table.py:1623, in LanceTable._execute_query(self, query, batch_size)
   1619 def _execute_query(
   1620     self, query: Query, batch_size: Optional[int] = None
   1621 ) -> pa.RecordBatchReader:
   1622     ds = self.to_lance()
-> 1623     return ds.scanner(
   1624         columns=query.columns,
   1625         filter=query.filter,
   1626         prefilter=query.prefilter,
   1627         nearest={
   1628             "column": query.vector_column,
   1629             "q": query.vector,
   1630             "k": query.k,
   1631             "metric": query.metric,
   1632             "nprobes": query.nprobes,
   1633             "refine_factor": query.refine_factor,
   1634         },
   1635         with_row_id=query.with_row_id,
   1636         batch_size=batch_size,
   1637     ).to_reader()

File e:\usual_app\anaconda\envs\GraphRAG\Lib\site-packages\lance\dataset.py:336, in LanceDataset.scanner(self, columns, filter, limit, offset, nearest, batch_size, batch_readahead, fragment_readahead, scan_in_order, fragments, prefilter, with_row_id, use_stats)
    320 builder = (
    321     ScannerBuilder(self)
    322     .columns(columns)
    (...)
    333     .use_stats(use_stats)
    334 )
    335 if nearest is not None:
--> 336     builder = builder.nearest(**nearest)
    337 return builder.to_scanner()

File e:\usual_app\anaconda\envs\GraphRAG\Lib\site-packages\lance\dataset.py:2150, in ScannerBuilder.nearest(self, column, q, k, metric, nprobes, refine_factor, use_index)
   2146     raise TypeError(
   2147         f"Query column {column} must be a vector. Got {column_field.type}."
   2148     )
   2149 if len(q) != column_type.list_size:
-> 2150     raise ValueError(
   2151         f"Query vector size {len(q)} does not match index column size"
   2152         f" {column_type.list_size}"
   2153     )
   2155 if k is not None and int(k) <= 0:
   2156     raise ValueError(f"Nearest-K must be > 0 but got {k}")

ValueError: Query vector size 1024 does not match index column size 1536
Hi @6643789wsx
Are you still having this issue? The error makes sense, since 1536 is the default embedding size for OpenAI's text-embedding-3-small and text-embedding-ada-002. Are you using a different embedding model?
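A quick way to confirm a model mismatch is to compare the dimension your query-time embedder produces against the dimension stored in the entity embeddings. This is only a sketch: the parquet path is an assumption about your own output folder, and text_embedder is the embedding client built earlier in the local_search notebook.

import pandas as pd

# path below is an assumption; point it at your own indexing output
entity_df = pd.read_parquet("output/<run-id>/artifacts/create_final_entities.parquet")
stored_dim = len(entity_df["description_embedding"].iloc[0])

# text_embedder is the embedder object the notebook passes to LocalSearch
query_dim = len(text_embedder.embed("dimension check"))

print(stored_dim, query_dim)  # e.g. 1536 vs 1024 reproduces this exact error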
https://github.com/microsoft/graphrag/issues/451 It looks like it happens with local embeddings. I can't find where to change the size.
Hi!
We are consolidating alternate model and local embedding issues here: https://github.com/microsoft/graphrag/issues/657
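In the meantime, note that the size is not a query-time setting: it is the width of the vector column that was written when the LanceDB index was built. You can check what the existing table expects with something like the sketch below (the database path and table name are assumptions for a default local setup):

import lancedb

db = lancedb.connect("./lancedb")                        # assumed location of the vector store
print(db.table_names())                                  # find the entity embeddings table
tbl = db.open_table("entity_description_embeddings")     # assumed table name
print(tbl.schema)  # the vector column will show a fixed-size list of length 1536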
Hi @6643789wsx, @goodmaney and @AlonsoGuevara,
I am also having the same issue when I run the example_notebooks/local_search.ipynb example.
I traced the code for a few hours and found the real cause:
- This is not a bug in either the graphrag core code or the lancedb core code.
- It is not a hyperparameter issue: 1536 is the default size for OpenAI embeddings.
- The real cause is that the local_search example loads entity_embedding_df from a parquet file whose 1536-dimensional embeddings were already generated with the OpenAI API, so any query embedder that outputs a different dimension trips LanceDB's size check (a minimal standalone repro is sketched below).
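To see that this is just LanceDB enforcing the width of the vector column, here is a minimal, self-contained sketch; the database path and table name are made up for illustration and have nothing to do with graphrag:

import lancedb
import numpy as np

db = lancedb.connect("/tmp/lancedb_repro")  # throwaway database for the demo

# create a table whose vector column is fixed at 1536 dimensions, like the graphrag index
db.create_table(
    "demo",
    data=[{"id": "a", "vector": np.random.rand(1536).tolist()}],
    mode="overwrite",
)

# searching it with a 1024-dimensional vector raises the same ValueError as above
tbl = db.open_table("demo")
tbl.search(np.random.rand(1024).tolist()).limit(1).to_list()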
There are two ways to solve this issue:
- Use an OpenAI embedder that produces 1536-dimensional vectors as the text embedder, or
- Regenerate the entity_embedding_df embeddings with your custom embedding model, for example (a fuller sketch follows this list):
entity_embedding_df["description_embedding"] = entity_embedding_df["description"].apply(emb_func)
Hope this can help newcomers like me.
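For the second option, here is a rough sketch of what the regeneration can look like. Everything in it is an assumption about a local setup, not a graphrag API: emb_func, the SentenceTransformer model, and the parquet path are placeholders you would swap for your own.

import pandas as pd
from sentence_transformers import SentenceTransformer  # any local model you already use

# example: a model that outputs 1024-dimensional embeddings (placeholder choice)
model = SentenceTransformer("intfloat/multilingual-e5-large")

def emb_func(text: str) -> list[float]:
    # return a plain list so it round-trips cleanly through parquet
    return model.encode(text).tolist()

# path is an assumption; point it at your own indexing output
path = "output/<run-id>/artifacts/create_final_entities.parquet"
entity_embedding_df = pd.read_parquet(path)
entity_embedding_df["description_embedding"] = entity_embedding_df["description"].apply(emb_func)
entity_embedding_df.to_parquet(path)

After regenerating, re-run the notebook cells that load the entities and populate the LanceDB store so the vector column is rebuilt with the new dimension, and make sure the same model is used as the query-time text_embedder.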