ml-commons
ml-commons copied to clipboard
[BUG] Neural search: ArrayIndexOutOfBoundsException: Index 495884 out of bounds for length 1
What is the bug? A clear and concise description of the bug.
In OpenSearch 2.12.0:
When using a GPU and issuing concurrent neural search requests, an ArrayIndexOutOfBoundsException was encountered.
I'm not sure whether it's a concurrency issue, but what I do know is that a single request succeeds, and the exception only occurs when there are concurrent requests.
Number of concurrent requests: more than 5
Request:
GET irp_index_vec/_search
{
"size": 100,
"query": {
"bool": {
"filter": [
{
"bool": {
"must": [
{
"terms": {
"stat": [
1
]
}
}
]
}
}
],
"must": [
{
"neural": {
"embeddingCnVector1": {
"query_text": "some content",
"k": 100
}
}
}
]
}
}
}
Exception:
[2024-03-28T19:30:27,355][WARN ][r.suppressed ] [opensearch-cluster_manager] path: /irp_index_vec/_search, params: {typed_keys=true, index=irp_index_vec}
org.opensearch.action.search.SearchPhaseExecutionException: all shards failed
at org.opensearch.action.search.AbstractSearchAsyncAction.onPhaseFailure(AbstractSearchAsyncAction.java:722) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.search.AbstractSearchAsyncAction.executeNextPhase(AbstractSearchAsyncAction.java:379) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.search.FetchSearchPhase.moveToNextPhase(FetchSearchPhase.java:298) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.search.FetchSearchPhase.lambda$innerRun$1(FetchSearchPhase.java:138) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.search.CountedCollector.countDown(CountedCollector.java:66) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.search.CountedCollector.onFailure(CountedCollector.java:85) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.search.FetchSearchPhase$2.onFailure(FetchSearchPhase.java:257) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.ActionListenerResponseHandler.handleException(ActionListenerResponseHandler.java:75) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.handleException(SearchTransportService.java:766) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.TransportService$9.handleException(TransportService.java:1725) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.security.transport.SecurityInterceptor$RestoringTransportResponseHandler.handleException(SecurityInterceptor.java:404) [opensearch-security-2.12.0.0.jar:2.12.0.0]
at org.opensearch.transport.TransportService$ContextRestoreResponseHandler.handleException(TransportService.java:1511) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundHandler.lambda$handleException$5(InboundHandler.java:447) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.common.util.concurrent.OpenSearchExecutors$DirectExecutorService.execute(OpenSearchExecutors.java:343) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundHandler.handleException(InboundHandler.java:445) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundHandler.handlerResponseError(InboundHandler.java:437) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundHandler.messageReceived(InboundHandler.java:170) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundHandler.inboundMessage(InboundHandler.java:127) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.TcpTransport.inboundMessage(TcpTransport.java:770) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundPipeline.forwardFragments(InboundPipeline.java:175) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundPipeline.doHandleBytes(InboundPipeline.java:150) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.InboundPipeline.handleBytes(InboundPipeline.java:115) [opensearch-2.12.0.jar:2.12.0]
at org.opensearch.transport.netty4.Netty4MessageChannelHandler.channelRead(Netty4MessageChannelHandler.java:95) [transport-netty4-client-2.12.0.jar:2.12.0]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:280) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1475) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
at io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1338) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
at io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1387) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
at io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:529) [netty-codec-4.1.106.Final.jar:4.1.106.Final]
at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:468) [netty-codec-4.1.106.Final.jar:4.1.106.Final]
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:290) [netty-codec-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:689) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:652) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997) [netty-common-4.1.106.Final.jar:4.1.106.Final]
at java.base/java.lang.Thread.run(Thread.java:1583) [?:?]
Caused by: org.opensearch.OpenSearchException$3: Index 495884 out of bounds for length 1
at org.opensearch.OpenSearchException.guessRootCauses(OpenSearchException.java:708) ~[opensearch-core-2.12.0.jar:2.12.0]
at org.opensearch.action.search.AbstractSearchAsyncAction.executeNextPhase(AbstractSearchAsyncAction.java:377) [opensearch-2.12.0.jar:2.12.0]
... 49 more
Caused by: java.lang.ArrayIndexOutOfBoundsException: Index 495884 out of bounds for length 1
at org.apache.lucene.util.SparseFixedBitSet.get(SparseFixedBitSet.java:129) ~[lucene-core-9.9.2.jar:9.9.2 a2939784c4ca60bc28bf488b5479c02fc2e5e22c - 2024-01-25 09:51:09]
at org.opensearch.search.fetch.FetchPhase.findRootDocumentIfNested(FetchPhase.java:283) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.search.fetch.FetchPhase.prepareHitContext(FetchPhase.java:299) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.search.fetch.FetchPhase.execute(FetchPhase.java:172) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.search.SearchService.lambda$executeFetchPhase$3(SearchService.java:782) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:74) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.action.ActionRunnable$2.doRun(ActionRunnable.java:89) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:913) ~[opensearch-2.12.0.jar:2.12.0]
at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) ~[opensearch-2.12.0.jar:2.12.0]
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144) ~[?:?]
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642) ~[?:?]
... 1 more
How can one reproduce the bug? Steps to reproduce the behavior:
- Create neural retrieval concurrent requests
- View cluster startup logs
- See the error logs
What is the expected behavior? A clear and concise description of what you expected to happen.
Neural search should work correctly under concurrent requests when using a GPU.
What is your host/environment?
- OS: Linux CentOS 7.9
- Version: 2.12.0
- Plugins: ml_commons
Do you have any screenshots? If applicable, add screenshots to help explain your problem.
Do you have any additional context? Add any other context about the problem.
I'm not sure whether it's a client issue, because the OpenSearch Java client version I used is 2.11.1. But I guess it's not.
Does this not happen if you don't use a GPU? It doesn't seem related to GPU usage.
> Does this not happen if you don't use a GPU? It doesn't seem related to GPU usage.
Yes, I have not encountered this exception while using the CPU.
@lihuimingxs Can you provide more details? What GPU are you using? What's your cluster setting like how many data nodes, ML nodes, are you using GPU instance as data nodes ?
> @lihuimingxs Can you provide more details? What GPU are you using? What's your cluster setting like how many data nodes, ML nodes, are you using GPU instance as data nodes ?
@ylwu-amzn Sure! I have 6 data nodes (including 1 master node) and 2 ML nodes (GPUs). The ML nodes are solely utilized for vector computations and do not store any data.
Note: Later, due to business requirements, I expanded the number of data nodes to 8 and ML nodes to 4, yet the issue persisted. Hence, I suspect the issue is not closely related to the number of nodes. I hope this additional explanation is helpful.
ML nodes environment:
- OS: CentOS 7.3 64bit
- GPU: 1 *NVIDIA P100 / 1 * 16G
- CPU: 8vCPUs
- Memory: 64GiB
index:
PUT irp_cre_vec_20240425
{
"aliases": {
"irp_cre_vec": {}
},
"mappings": {
"_source": {
"excludes": [
"embeddingCnContent1",
"embeddingCnContent2"
]
},
"properties": {
"embeddingCnContent1": {
"type": "text",
"index": false
},
"embeddingCnContent2": {
"type": "text",
"index": false
},
"embeddingCnVector1": {
"type": "knn_vector",
"dimension": 1024,
"method": {
"engine": "faiss",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
},
"embeddingCnVector2": {
"type": "knn_vector",
"dimension": 1024,
"method": {
"engine": "faiss",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
},
"id": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"replication": {
"type": "DOCUMENT"
},
"mapping": {
"total_fields": {
"limit": "1000"
}
},
"search": {
"default_pipeline": "cre_v2_search_model_pipeline"
},
"number_of_shards": "1",
"max_result_window": "10000",
"default_pipeline": "cre_v2_convert_pipeline",
"knn": "true",
"number_of_replicas": "0"
}
}
}
opensearch.yml:
Note: The OpenSearch version is `2.12.0`, and the `/home/opensearch/opensearch-2.9.0` path in the configuration file indicates only the OpenSearch installation path, not the OpenSearch version.
# ======================== OpenSearch Configuration =========================
#
# NOTE: OpenSearch comes with reasonable defaults for most settings.
# Before you set out to tweak and tune the configuration, make sure you
# understand what you are trying to accomplish and the consequences.
#
# The primary way of configuring a node is via this file. This template lists
# the most important settings you may want to configure for a production cluster.
#
# Please consult the documentation for further information on configuration options:
# https://www.opensearch.org
#
# ---------------------------------- Cluster -----------------------------------
#
# Use a descriptive name for your cluster:
#
#cluster.name: my-application
cluster.name: opensearch-cluster
#
# ------------------------------------ Node ------------------------------------
#
# Use a descriptive name for the node:
#
#node.name: node-1
node.name: opensearch-cluster_manager
node.roles: [ cluster_manager, data, ingest ]
#
# Add custom attributes to the node:
#
#node.attr.rack: r1
node.attr.crmTag: "vecPosition"
node.attr.ctsvec: "ctsvec"
#
# ----------------------------------- Paths ------------------------------------
#
# Path to directory where to store the data (separate multiple locations by comma):
#
#path.data: /path/to/data
#
# Path to log files:
#
#path.logs: /path/to/logs
#
# Path to snapshot files:
path.repo: ["/mnt/sfs_turbo"]
# ----------------------------------- Memory -----------------------------------
#
# Lock the memory on startup:
#
#bootstrap.memory_lock: true
#
# Make sure that the heap size is set to about half the memory available
# on the system and that the owner of the process is allowed to use this
# limit.
#
# OpenSearch performs poorly when the system is swapping the memory.
#
# ---------------------------------- Network -----------------------------------
#
# Set the bind address to a specific IP (IPv4 or IPv6):
#
#network.host: 192.168.0.1
network.host: 0.0.0.0
#
# Set a custom port for HTTP:
#
http.port: 9200
#
# For more information, consult the network module documentation.
#
# --------------------------------- Discovery ----------------------------------
#
# Pass an initial list of hosts to perform discovery when this node is started:
# The default list of hosts is ["127.0.0.1", "[::1]"]
#
#discovery.seed_hosts: ["host1", "host2"]
discovery.seed_hosts: ["192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11"]
#
# Bootstrap the cluster using an initial set of cluster-manager-eligible nodes:
#
#cluster.initial_cluster_manager_nodes: ["node-1", "node-2"]
cluster.initial_cluster_manager_nodes: ["opensearch-cluster_manager"]
#
# For more information, consult the discovery and cluster formation module documentation.
#
# ---------------------------------- Gateway -----------------------------------
#
# Block initial recovery after a full cluster restart until N nodes are started:
#
#gateway.recover_after_nodes: 3
#
# For more information, consult the gateway module documentation.
#
# ---------------------------------- Various -----------------------------------
#
# Require explicit names when deleting indices:
#
#action.destructive_requires_name: true
#
# ---------------------------------- Remote Store -----------------------------------
# Controls whether cluster imposes index creation only with remote store enabled
# cluster.remote_store.enabled: true
#
# Repository to use for segment upload while enforcing remote store for an index
# cluster.remote_store.repository: my-repo-1
#
# Controls whether cluster imposes index creation only with translog remote store enabled
# cluster.remote_store.translog.enabled: true
#
# Repository to use for translog upload while enforcing remote store for an index
# cluster.remote_store.translog.repository: my-repo-1
#
# ---------------------------------- Experimental Features -----------------------------------
#
# Gates the visibility of the experimental segment replication features until they are production ready.
#
#opensearch.experimental.feature.segment_replication_experimental.enabled: false
#
#
# Gates the visibility of the index setting that allows persisting data to remote store along with local disk.
# Once the feature is ready for production release, this feature flag can be removed.
#
#opensearch.experimental.feature.remote_store.enabled: false
#
#
# Gates the functionality of a new parameter to the snapshot restore API
# that allows for creation of a new index type that searches a snapshot
# directly in a remote repository without restoring all index data to disk
# ahead of time.
#
#opensearch.experimental.feature.searchable_snapshot.enabled: false
#
#
# Gates the functionality of enabling extensions to work with OpenSearch.
# This feature enables applications to extend features of OpenSearch outside of
# the core.
#
#opensearch.experimental.feature.extensions.enabled: false
#
#
# Gates the concurrent segment search feature. This feature enables concurrent segment search in a separate
# index searcher threadpool.
#
#opensearch.experimental.feature.concurrent_segment_search.enabled: false
######## Start OpenSearch Security Demo Configuration ########
# WARNING: revise all the lines below before you go into production
#plugins.security.ssl.transport.pemcert_filepath: esnode.pem
#plugins.security.ssl.transport.pemkey_filepath: esnode-key.pem
#plugins.security.ssl.transport.pemtrustedcas_filepath: root-ca.pem
#plugins.security.ssl.transport.enforce_hostname_verification: false
#plugins.security.ssl.http.enabled: false
#plugins.security.ssl.http.pemcert_filepath: esnode.pem
#plugins.security.ssl.http.pemkey_filepath: esnode-key.pem
#plugins.security.ssl.http.pemtrustedcas_filepath: root-ca.pem
#plugins.security.allow_unsafe_democertificates: true
#plugins.security.allow_default_init_securityindex: true
#plugins.security.authcz.admin_dn:
# - CN=,OU=client,O=client,L=test, C=de
plugins.security.disabled: false
plugins.security.ssl.transport.pemcert_filepath: /home/opensearch/opensearch-2.9.0/config/node1.pem
plugins.security.ssl.transport.pemkey_filepath: /home/opensearch/opensearch-2.9.0/config/node1-key.pem
plugins.security.ssl.transport.pemtrustedcas_filepath: /home/opensearch/opensearch-2.9.0/config/root-ca.pem
plugins.security.ssl.transport.enforce_hostname_verification: false
#plugins.security.ssl.http.enabled: true
plugins.security.ssl.http.enabled: false
plugins.security.ssl.http.pemcert_filepath: /home/opensearch/opensearch-2.9.0/config/node1.pem
plugins.security.ssl.http.pemkey_filepath: /home/opensearch/opensearch-2.9.0/config/node1-key.pem
plugins.security.ssl.http.pemtrustedcas_filepath: /home/opensearch/opensearch-2.9.0/config/root-ca.pem
plugins.security.allow_unsafe_democertificates: true
plugins.security.allow_default_init_securityindex: true
plugins.security.authcz.admin_dn:
- CN=admin,OU=Taa,O=Carrer,L=Beijing,ST=Beijing,C=CN
plugins.security.nodes_dn:
- CN=100.125.1.250,OU=Taa,O=Carrer,L=Beijing,ST=Beijing,C=CN
plugins.security.audit.type: internal_opensearch
plugins.security.enable_snapshot_restore_privilege: true
plugins.security.check_snapshot_restore_write_privileges: true
plugins.security.restapi.roles_enabled: ["all_access", "security_rest_api_access"]
plugins.security.system_indices.enabled: true
plugins.security.system_indices.indices: [".plugins-ml-config", ".plugins-ml-connector", ".plugins-ml-model-group", ".plugins-ml-model", ".plugins-ml-task", ".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opensearch-notifications-*", ".opensearch-notebooks", ".opensearch-observability", ".ql-datasources", ".opendistro-asynchronous-search-response*", ".replication-metadata-store", ".opensearch-knn-models"]
node.max_local_storage_nodes: 3
######## End OpenSearch Security Demo Configuration ########