haystack-core-integrations
haystack-core-integrations copied to clipboard
bug: Weaviate cannot retrieve `_split_overlap` field generated by the `DocumentSplitter`
Describe the bug
Weaviate cannot retrieve the `_split_overlap` field generated by the `DocumentSplitter`, even if the field is specified in the schema.
To Reproduce: see more details at https://github.com/deepset-ai/haystack/discussions/8511
Describe your environment (please complete the following information):
- OS: macOS
- Haystack version: 2.6.1
- Integration version: 4.0.0
Minimal reproducible code example:
# Minimal reproduction: split one document with DocumentSplitter, write the
# resulting chunks to Weaviate, then read them back.
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore

# Expects a Weaviate instance reachable at localhost:8080.
document_store = WeaviateDocumentStore(url="http://localhost:8080")

# Fifteen short sentences, so the splitter below produces several chunks.
doc = Document(content = """This is a test. This is another test. This is a third test.
This is a fourth test. This is a fifth test. This is a sixth test.
This is a seventh test. This is an eighth test. This is a ninth test.
This is a tenth test. This is an eleventh test. This is a twelfth test.
This is a thirteenth test. This is a fourteenth test. This is a fifteenth test.""")

# Word-based splitting with a non-zero overlap; per this report, the splitter
# then generates the `_split_overlap` field that Weaviate fails to return.
splitter = DocumentSplitter(split_length=3, split_overlap=2, split_by="word")
splitted_docs = splitter.run([doc])["documents"]

# The traceback below points at the read (filter_documents), not at this write.
document_store.write_documents(splitted_docs)

# Raises WeaviateQueryError: "creating primitive value for _split_overlap".
print(document_store.filter_documents()[0])
Error:
Traceback (most recent call last):
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/grpc/query.py", line 798, in __call
res = await self._connection.grpc_stub.Search(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/grpc/aio/_call.py", line 327, in __await__
raise _create_rpc_error(
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for _split_overlap: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {grpc_message:"creating primitive value for _split_overlap: proto: invalid type: []interface {}", grpc_status:2, created_time:"2024-11-11T16:17:57.2080374+01:00"}"
>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/try.py", line 36, in <module>
print(document_store.filter_documents()[0])
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py", line 403, in filter_documents
return [self._to_document(doc) for doc in result]
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py", line 403, in <listcomp>
return [self._to_document(doc) for doc in result]
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/iterator.py", line 59, in __next__
res = self.__query.fetch_objects(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/syncify.py", line 23, in sync_method
return _EventLoopSingleton.get_instance().run_until_complete(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/event_loop.py", line 40, in run_until_complete
return fut.result()
File "/home/anakin87/.pyenv/versions/3.10.13/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/home/anakin87/.pyenv/versions/3.10.13/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/queries/fetch_objects/query.py", line 65, in fetch_objects
res = await self._query.get(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/grpc/query.py", line 805, in __call
raise WeaviateQueryError(str(e), "GRPC search") # pyright: ignore
weaviate.exceptions.WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for _split_overlap: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {grpc_message:"creating primitive value for _split_overlap: proto: invalid type: []interface {}", grpc_status:2, created_time:"2024-11-11T16:17:57.2080374+01:00"}"
>.
Weaviate code to reproduce the bug
# Standalone reproduction that uses only the Weaviate Python client (no
# Haystack): insert one object carrying a list-of-objects property, then
# iterate over the collection with and without an explicit
# `return_properties` selection.
import weaviate

# Expects a local Weaviate instance (HTTP on 8080, gRPC on 50051).
client = weaviate.WeaviateClient(
    connection_params=(weaviate.connect.base.ConnectionParams.from_url(url="http://localhost:8080", grpc_port=50051))
)
client.connect()

DOCUMENT_COLLECTION_PROPERTIES = [
    {"name": "_original_id", "dataType": ["text"]},
    {"name": "content", "dataType": ["text"]},
    {"name": "dataframe", "dataType": ["text"]},
    {"name": "blob_data", "dataType": ["blob"]},
    {"name": "blob_mime_type", "dataType": ["text"]},
    {"name": "score", "dataType": ["number"]},
    # the following properties can be present or not. Weaviate shows the same behavior:
    # documents are correctly written but not correctly returned
    # {
    #     'name': 'mylistofobjects', 'dataType': ['object[]'],
    #     'nestedProperties': [
    #         {'dataType': ['text'], 'name': 'doc_id'},
    #         {'dataType': ['number[]'], 'name': 'range'}],
    # }
]

collection_settings = {
    "class": "Default",
    "invertedIndexConfig": {"indexNullState": True},
    "properties": DOCUMENT_COLLECTION_PROPERTIES,
}
collection = client.collections.create_from_dict(collection_settings)

# `mylistofobjects` is a list of nested objects; it is the property named in
# the gRPC error reported below.
properties = {
    'content': 'This is a test document',
    'dataframe': None,
    'score': None,
    'mylistofobjects': [{'doc_id': '1', 'range': [1, 2]}],
    '_original_id': '3972bbfa2c09af05a7118ed4233124582a138dd83e3de1db3ff742f810df4c41',
}
collection.data.insert(
    properties=properties,
    vector=[0.1] * 300,
)

# this works and returns all properties except byte
# (in this case byte properties are not present, but they are not returned even if present)
it = collection.iterator(include_vector=True)
for i in it:
    print(i)

# this fails
# (explicitly requesting the list-of-objects property raises WeaviateQueryError)
it = collection.iterator(include_vector=True, return_properties=["content", "mylistofobjects"])
for i in it:
    print(i)
Error:
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/grpc/query.py", line 798, in __call
res = await self._connection.grpc_stub.Search(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/grpc/aio/_call.py", line 327, in __await__
raise _create_rpc_error(
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for mylistofobjects: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {created_time:"2024-11-11T17:27:11.360212037+01:00", grpc_status:2, grpc_message:"creating primitive value for mylistofobjects: proto: invalid type: []interface {}"}"
>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/newtryweaviate.py", line 55, in <module>
for i in it:
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/iterator.py", line 59, in __next__
res = self.__query.fetch_objects(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/syncify.py", line 23, in sync_method
return _EventLoopSingleton.get_instance().run_until_complete(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/event_loop.py", line 40, in run_until_complete
return fut.result()
File "/home/anakin87/.pyenv/versions/3.10.13/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/home/anakin87/.pyenv/versions/3.10.13/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/queries/fetch_objects/query.py", line 65, in fetch_objects
res = await self._query.get(
File "/home/anakin87/apps/haystack-core-integrations/integrations/weaviate/.hatch/weaviate-haystack/lib/python3.10/site-packages/weaviate/collections/grpc/query.py", line 805, in __call
raise WeaviateQueryError(str(e), "GRPC search") # pyright: ignore
weaviate.exceptions.WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for mylistofobjects: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {created_time:"2024-11-11T17:27:11.360212037+01:00", grpc_status:2, grpc_message:"creating primitive value for mylistofobjects: proto: invalid type: []interface {}"}"
>.