druid icon indicating copy to clipboard operation
druid copied to clipboard

Middle-Managers disappearing from the cluster

Open Fryuni opened this issue 3 years ago • 12 comments

Affected Version

24.0.0

Description

Cluster size:

  • 5 Coordinators
  • 5 Overlords
  • 20 Middle-Managers
  • 6 Historicals
  • 4 Brokers
  • 3 Routers

Configurations in use (possibly sensitive information redacted):

Middle-Manager
{
  "awt.toolkit": "sun.awt.X11.XToolkit",
  "druid.coordinator.loadqueuepeon.type": "http",
  "druid.discovery.k8s.clusterIdentifier": "druid",
  "druid.discovery.type": "k8s",
  "druid.emitter": "noop",
  "druid.emmiter.logging.logLevel": "warn",
  "druid.extensions.loadList": "[ \"druid-kubernetes-extensions\", \"druid-basic-security\", \"prometheus-emitter\", \"postgresql-metadata-storage\", \"druid-google-extensions\", \"druid-pac4j\", \"druid-kafka-indexing-service\", \"druid-lookups-cached-global\", \"druid-virtual-columns\", \"druid-avro-extensions\", \"druid-datasketches\", \"druid-moving-average-query\", \"druid-time-min-max\" ]",
  "druid.generic.useDefaultValueForNull": "true",
  "druid.google.bucket": "***-druid-deepstorage",
  "druid.host": "10.28.8.24",
  "druid.indexer.fork.property.druid.processing.buffer.sizeBytes": "100MiB",
  "druid.indexer.fork.property.druid.processing.numMergeBuffers": "2",
  "druid.indexer.fork.property.druid.processing.numThreads": "1",
  "druid.indexer.logs.bucket": "***-druid-deepstorage",
  "druid.indexer.logs.kill.durationToRetain": "604800000",
  "druid.indexer.logs.kill.enabled": "false",
  "druid.indexer.logs.prefix": "taskLogs",
  "druid.indexer.logs.type": "google",
  "druid.indexer.runner.javaOptsArray": "[\"-XX:MaxDirectMemorySize=3072m\",\"-Xmx1024m\",\"-Xms128m\"]",
  "druid.indexer.runner.type": "httpRemote",
  "druid.javascript.enabled": "false",
  "druid.lookup.enableLookupSyncOnStartup": "true",
  "druid.lookup.lookupTier": "broker",
  "druid.metadata.storage.connector.connectURI": "jdbc:postgresql://***.svc.cluster.local:5432/druid",
  "druid.metadata.storage.connector.createTables": "true",
  "druid.metadata.storage.type": "postgresql",
  "druid.port": "8088",
  "druid.processing.numMergeBuffers": "2",
  "druid.processing.numThreads": "1",
  "druid.query.groupBy.applyLimitPushDownToSegment": "true",
  "druid.query.groupBy.maxOnDiskStorage": "1073741824",
  "druid.realtime.cache.populateCache": "true",
  "druid.realtime.cache.useCache": "true",
  "druid.selectors.coordinator.serviceName": "druid/coordinator",
  "druid.selectors.indexing.serviceName": "druid/overlord",
  "druid.server.http.numThreads": "5",
  "druid.serverview.type": "http",
  "druid.service": "druid/middlemanager",
  "druid.storage.type": "google",
  "druid.worker.capacity": "1",
  "druid.zk.service.enabled": "false",
  "file.encoding": "UTF-8",
  "file.encoding.pkg": "sun.io",
  "file.separator": "/",
  "java.awt.graphicsenv": "sun.awt.X11GraphicsEnvironment",
  "java.awt.printerjob": "sun.print.PSPrinterJob",
  "java.class.path": "/tmp/conf/druid/cluster/_common:/tmp/conf/druid/cluster/data/middleManager:lib/zstd-jni-1.5.2-3.jar:lib/jackson-dataformat-smile-2.10.5.jar:lib/commons-beanutils-1.9.4.jar:lib/classmate-1.1.0.jar:lib/log4j-jul-2.18.0.jar:lib/javax.inject-1.jar:lib/avatica-metrics-1.17.0.jar:lib/avatica-server-1.17.0.jar:lib/aopalliance-1.0.jar:lib/aether-util-0.9.0.M2.jar:lib/jetty-servlet-9.4.48.v20220622.jar:lib/spymemcached-2.12.3.jar:lib/zookeeper-jute-3.5.9.jar:lib/guava-16.0.1.jar:lib/google-oauth-client-1.26.0.jar:lib/druid-core-24.0.0.jar:lib/validation-api-1.1.0.Final.jar:lib/janino-3.0.11.jar:lib/httpclient-4.5.13.jar:lib/accessors-smart-1.2.jar:lib/async-http-client-2.5.3.jar:lib/joda-time-2.10.5.jar:lib/json-smart-2.3.jar:lib/guice-servlet-4.1.0.jar:lib/vavr-match-0.10.2.jar:lib/curator-client-4.3.0.jar:lib/calcite-linq4j-1.21.0.jar:lib/commons-collections4-4.2.jar:lib/curator-recipes-4.3.0.jar:lib/netty-transport-native-epoll-4.1.68.Final-linux-x86_64.jar:lib/aggdesigner-algorithm-6.0.jar:lib/jna-4.5.1.jar:lib/jackson-jaxrs-json-provider-2.10.5.jar:lib/asm-9.3.jar:lib/netty-reactive-streams-2.0.0.jar:lib/metrics-core-4.0.0.jar:lib/commons-collections-3.2.2.jar:lib/guice-assistedinject-4.1.0.jar:lib/derby-10.14.2.0.jar:lib/fastutil-8.5.4.jar:lib/google-api-client-1.26.0.jar:lib/netty-buffer-4.1.68.Final.jar:lib/druid-sql-24.0.0.jar:lib/log4j-slf4j-impl-2.18.0.jar:lib/commons-compiler-3.0.11.jar:lib/caffeine-2.8.0.jar:lib/jackson-module-guice-2.10.5.jar:lib/jetty-security-9.4.48.v20220622.jar:lib/maven-repository-metadata-3.1.1.jar:lib/commons-io-2.11.0.jar:lib/jackson-jq-0.0.10.jar:lib/asm-commons-9.3.jar:lib/compress-lzf-1.0.4.jar:lib/commons-codec-1.13.jar:lib/protobuf-java-3.11.0.jar:lib/config-magic-0.9.jar:lib/airline-io-2.8.4.jar:lib/audience-annotations-0.5.0.jar:lib/fastutil-extra-8.5.4.jar:lib/txw2-2.3.1.jar:lib/jersey-core-1.19.4.jar:lib/avatica-core-1.17.0.jar:lib/netty-3.10.6.Final.jar:lib/fastutil-core-8.5.4.jar:lib/druid-indexing-hadoo
p-24.0.0.jar:lib/jetty-client-9.4.48.v20220622.jar:lib/druid-indexing-service-24.0.0.jar:lib/commons-compress-1.21.jar:lib/reactive-streams-1.0.2.jar:lib/maven-model-builder-3.1.1.jar:lib/javax.el-api-3.0.0.jar:lib/netty-resolver-dns-4.1.68.Final.jar:lib/hibernate-validator-5.2.5.Final.jar:lib/druid-aws-common-24.0.0.jar:lib/sigar-1.6.5.132.jar:lib/maven-settings-3.1.1.jar:lib/log4j-1.2-api-2.18.0.jar:lib/resilience4j-bulkhead-1.3.1.jar:lib/icu4j-55.1.jar:lib/jackson-jaxrs-base-2.10.5.jar:lib/stax-ex-1.8.jar:lib/aether-connector-okhttp-0.0.9.jar:lib/javax.activation-api-1.2.0.jar:lib/netty-codec-4.1.68.Final.jar:lib/calcite-core-1.21.0.jar:lib/netty-codec-http-4.1.68.Final.jar:lib/aws-java-sdk-sts-1.12.264.jar:lib/jersey-servlet-1.19.4.jar:lib/json-path-2.3.0.jar:lib/druid-server-24.0.0.jar:lib/jetty-proxy-9.4.48.v20220622.jar:lib/lz4-java-1.8.0.jar:lib/jaxb-runtime-2.3.1.jar:lib/maven-artifact-3.6.0.jar:lib/jackson-module-jaxb-annotations-2.10.5.jar:lib/antlr4-runtime-4.5.1.jar:lib/curator-framework-4.3.0.jar:lib/druid-console-24.0.0.jar:lib/datasketches-java-3.2.0.jar:lib/RoaringBitmap-0.9.0.jar:lib/guice-4.1.0.jar:lib/log4j-api-2.18.0.jar:lib/jakarta.inject-api-1.0.3.jar:lib/netty-codec-socks-4.1.68.Final.jar:lib/jackson-annotations-2.10.5.jar:lib/derbyclient-10.14.2.0.jar:lib/aether-connector-file-0.9.0.M2.jar:lib/aws-java-sdk-ec2-1.12.264.jar:lib/disruptor-3.3.6.jar:lib/datasketches-memory-2.0.0.jar:lib/wagon-provider-api-2.4.jar:lib/xz-1.8.jar:lib/jetty-util-ajax-9.4.48.v20220622.jar:lib/commons-dbcp2-2.0.1.jar:lib/netty-handler-proxy-4.1.68.Final.jar:lib/javax.activation-1.2.0.jar:lib/jcodings-1.0.43.jar:lib/cron-scheduler-0.1.jar:lib/jsr305-2.0.1.jar:lib/aws-java-sdk-kms-1.12.264.jar:lib/jakarta.xml.bind-api-2.3.2.jar:lib/FastInfoset-1.2.15.jar:lib/jackson-datatype-joda-2.10.5.jar:lib/jcl-over-slf4j-1.7.36.jar:lib/druid-services-24.0.0.jar:lib/jetty-server-9.4.48.v20220622.jar:lib/curator-x-discovery-4.3.0.jar:lib/jvm-attach-api-1.5.jar:lib/jackson-jaxrs-smi
le-provider-2.10.5.jar:lib/maven-settings-builder-3.1.1.jar:lib/javax.el-3.0.0.jar:lib/google-http-client-1.26.0.jar:lib/rhino-1.7.11.jar:lib/jdbi-2.63.1.jar:lib/extendedset-24.0.0.jar:lib/netty-handler-4.1.68.Final.jar:lib/error_prone_annotations-2.11.0.jar:lib/commons-lang-2.6.jar:lib/jetty-util-9.4.48.v20220622.jar:lib/aether-api-0.9.0.M2.jar:lib/maven-aether-provider-3.1.1.jar:lib/commons-lang3-3.8.1.jar:lib/plexus-interpolation-1.19.jar:lib/httpcore-4.4.11.jar:lib/commons-pool2-2.2.jar:lib/druid-hll-24.0.0.jar:lib/jetty-continuation-9.4.48.v20220622.jar:lib/joni-2.1.27.jar:lib/istack-commons-runtime-3.0.7.jar:lib/aws-java-sdk-s3-1.12.264.jar:lib/ion-java-1.0.2.jar:lib/netty-common-4.1.68.Final.jar:lib/netty-transport-4.1.68.Final.jar:lib/jetty-rewrite-9.4.48.v20220622.jar:lib/jersey-server-1.19.4.jar:lib/derbynet-10.14.2.0.jar:lib/jetty-servlets-9.4.48.v20220622.jar:lib/tesla-aether-0.0.5.jar:lib/jmespath-java-1.12.264.jar:lib/zookeeper-3.5.9.jar:lib/async-http-client-netty-utils-2.5.3.jar:lib/druid-processing-24.0.0.jar:lib/jetty-http-9.4.48.v20220622.jar:lib/slf4j-api-1.7.36.jar:lib/jackson-dataformat-cbor-2.10.5.jar:lib/jakarta.activation-api-1.2.1.jar:lib/aws-java-sdk-core-1.12.264.jar:lib/jaxb-api-2.3.1.jar:lib/j2objc-annotations-1.1.jar:lib/aether-spi-0.9.0.M2.jar:lib/jsr311-api-1.1.1.jar:lib/commons-math3-3.6.1.jar:lib/ipaddress-5.3.4.jar:lib/jersey-guice-1.19.4.jar:lib/airline-2.8.4.jar:lib/jackson-datatype-guava-2.10.5.jar:lib/commons-logging-1.1.1.jar:lib/asm-tree-9.3.jar:lib/netty-resolver-4.1.68.Final.jar:lib/jackson-core-2.10.5.jar:lib/commons-text-1.3.jar:lib/resilience4j-core-1.3.1.jar:lib/maven-model-3.1.1.jar:lib/netty-transport-native-unix-common-4.1.68.Final.jar:lib/checker-qual-2.5.7.jar:lib/okhttp-1.0.2.jar:lib/guice-multibindings-4.1.0.jar:lib/javax.servlet-api-3.1.0.jar:lib/jetty-io-9.4.48.v20220622.jar:lib/vavr-0.10.2.jar:lib/google-http-client-jackson2-1.26.0.jar:lib/netty-codec-dns-4.1.68.Final.jar:lib/opencsv-4.6.jar:lib/jackson-datab
ind-2.10.5.1.jar:lib/aether-impl-0.9.0.M2.jar:lib/plexus-utils-3.0.24.jar:lib/jboss-logging-3.2.1.Final.jar:lib/esri-geometry-api-2.2.0.jar:lib/asm-analysis-9.3.jar:lib/druid-gcp-common-24.0.0.jar:lib/log4j-core-2.18.0.jar:lib/shims-0.9.0.jar:",
  "java.class.version": "52.0",
  "java.endorsed.dirs": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/endorsed",
  "java.ext.dirs": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/ext:/usr/java/packages/lib/ext",
  "java.home": "/usr/lib/jvm/java-8-openjdk-amd64/jre",
  "java.io.tmpdir": "/druid/data",
  "java.library.path": "/usr/java/packages/lib/amd64:/usr/lib/x86_64-linux-gnu/jni:/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:/usr/lib/jni:/lib:/usr/lib",
  "java.runtime.name": "OpenJDK Runtime Environment",
  "java.runtime.version": "1.8.0_275-8u275-b01-1~deb9u1-b01",
  "java.specification.name": "Java Platform API Specification",
  "java.specification.vendor": "Oracle Corporation",
  "java.specification.version": "1.8",
  "java.util.logging.manager": "org.apache.logging.log4j.jul.LogManager",
  "java.vendor": "Oracle Corporation",
  "java.vendor.url": "http://java.oracle.com/",
  "java.vendor.url.bug": "http://bugreport.sun.com/bugreport/",
  "java.version": "1.8.0_275",
  "java.vm.info": "mixed mode",
  "java.vm.name": "OpenJDK 64-Bit Server VM",
  "java.vm.specification.name": "Java Virtual Machine Specification",
  "java.vm.specification.vendor": "Oracle Corporation",
  "java.vm.specification.version": "1.8",
  "java.vm.vendor": "Oracle Corporation",
  "java.vm.version": "25.275-b01",
  "line.separator": "\n",
  "log4j.shutdownCallbackRegistry": "org.apache.druid.common.config.Log4jShutdown",
  "log4j.shutdownHookEnabled": "true",
  "log4j2.is.webapp": "false",
  "net.spy.log.LoggerImpl": "net.spy.memcached.compat.log.SLF4JLogger",
  "org.jboss.logging.provider": "slf4j",
  "os.arch": "amd64",
  "os.name": "Linux",
  "os.version": "5.4.202+",
  "path.separator": ":",
  "sun.arch.data.model": "64",
  "sun.boot.class.path": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/resources.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/rt.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/sunrsasign.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/jsse.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/jce.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/charsets.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/jfr.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/classes",
  "sun.boot.library.path": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64",
  "sun.cpu.endian": "little",
  "sun.cpu.isalist": "",
  "sun.io.unicode.encoding": "UnicodeLittle",
  "sun.java.command": "org.apache.druid.cli.Main server middleManager",
  "sun.java.launcher": "SUN_STANDARD",
  "sun.jnu.encoding": "UTF-8",
  "sun.management.compiler": "HotSpot 64-Bit Tiered Compilers",
  "sun.os.patch.level": "unknown",
  "user.dir": "/opt/druid",
  "user.home": "/root",
  "user.language": "en",
  "user.name": "root",
  "user.timezone": "UTC"
}
Coordinator/Overlord
{
  "awt.toolkit": "sun.awt.X11.XToolkit",
  "druid.coordinator.asOverlord.enabled": "true",
  "druid.coordinator.asOverlord.overlordService": "druid/overlord",
  "druid.coordinator.balancer.strategy": "cachingCost",
  "druid.coordinator.kill.durationToRetain": "P180D",
  "druid.coordinator.kill.on": "false",
  "druid.coordinator.loadqueuepeon.type": "http",
  "druid.coordinator.period": "PT30S",
  "druid.coordinator.period.indexingPeriod": "PT5M",
  "druid.coordinator.startDelay": "PT30S",
  "druid.discovery.k8s.clusterIdentifier": "druid",
  "druid.discovery.type": "k8s",
  "druid.emitter": "prometheus",
  "druid.emitter.prometheus.addServiceAsLabel": "true",
  "druid.emitter.prometheus.port": "9999",
  "druid.emitter.prometheus.strategy": "exporter",
  "druid.emmiter.logging.logLevel": "warn",
  "druid.extensions.loadList": "[ \"druid-kubernetes-extensions\", \"druid-basic-security\", \"prometheus-emitter\", \"postgresql-metadata-storage\", \"druid-google-extensions\", \"druid-pac4j\", \"druid-kafka-indexing-service\", \"druid-lookups-cached-global\", \"druid-virtual-columns\", \"druid-avro-extensions\", \"druid-datasketches\", \"druid-moving-average-query\", \"druid-time-min-max\" ]",
  "druid.generic.useDefaultValueForNull": "true",
  "druid.google.bucket": "***-druid-deepstorage",
  "druid.host": "10.28.27.21",
  "druid.indexer.logs.bucket": "***-druid-deepstorage",
  "druid.indexer.logs.kill.durationToRetain": "604800000",
  "druid.indexer.logs.kill.enabled": "false",
  "druid.indexer.logs.prefix": "taskLogs",
  "druid.indexer.logs.type": "google",
  "druid.indexer.queue.startDelay": "PT30S",
  "druid.indexer.runner.type": "httpRemote",
  "druid.indexer.storage.type": "metadata",
  "druid.javascript.enabled": "false",
  "druid.lookup.enableLookupSyncOnStartup": "true",
  "druid.lookup.lookupTier": "broker",
  "druid.metadata.storage.connector.connectURI": "jdbc:postgresql://***.svc.cluster.local:5432/druid",
  "druid.metadata.storage.connector.createTables": "true",
  "druid.metadata.storage.type": "postgresql",
  "druid.peon.mode": "remote",
  "druid.port": "8081",
  "druid.query.groupBy.applyLimitPushDownToSegment": "true",
  "druid.query.groupBy.maxOnDiskStorage": "1073741824",
  "druid.selectors.coordinator.serviceName": "druid/coordinator",
  "druid.selectors.indexing.serviceName": "druid/overlord",
  "druid.serverview.type": "http",
  "druid.service": "druid/coordinator",
  "druid.storage.type": "google",
  "druid.zk.service.enabled": "false",
  "file.encoding": "UTF-8",
  "file.encoding.pkg": "sun.io",
  "file.separator": "/",
  "java.awt.graphicsenv": "sun.awt.X11GraphicsEnvironment",
  "java.awt.printerjob": "sun.print.PSPrinterJob",
  "java.class.path": "/tmp/conf/druid/cluster/_common:/tmp/conf/druid/cluster/master/coordinator-overlord:lib/zstd-jni-1.5.2-3.jar:lib/jackson-dataformat-smile-2.10.5.jar:lib/commons-beanutils-1.9.4.jar:lib/classmate-1.1.0.jar:lib/log4j-jul-2.18.0.jar:lib/javax.inject-1.jar:lib/avatica-metrics-1.17.0.jar:lib/avatica-server-1.17.0.jar:lib/aopalliance-1.0.jar:lib/aether-util-0.9.0.M2.jar:lib/jetty-servlet-9.4.48.v20220622.jar:lib/spymemcached-2.12.3.jar:lib/zookeeper-jute-3.5.9.jar:lib/guava-16.0.1.jar:lib/google-oauth-client-1.26.0.jar:lib/druid-core-24.0.0.jar:lib/validation-api-1.1.0.Final.jar:lib/janino-3.0.11.jar:lib/httpclient-4.5.13.jar:lib/accessors-smart-1.2.jar:lib/async-http-client-2.5.3.jar:lib/joda-time-2.10.5.jar:lib/json-smart-2.3.jar:lib/guice-servlet-4.1.0.jar:lib/vavr-match-0.10.2.jar:lib/curator-client-4.3.0.jar:lib/calcite-linq4j-1.21.0.jar:lib/commons-collections4-4.2.jar:lib/curator-recipes-4.3.0.jar:lib/netty-transport-native-epoll-4.1.68.Final-linux-x86_64.jar:lib/aggdesigner-algorithm-6.0.jar:lib/jna-4.5.1.jar:lib/jackson-jaxrs-json-provider-2.10.5.jar:lib/asm-9.3.jar:lib/netty-reactive-streams-2.0.0.jar:lib/metrics-core-4.0.0.jar:lib/commons-collections-3.2.2.jar:lib/guice-assistedinject-4.1.0.jar:lib/derby-10.14.2.0.jar:lib/fastutil-8.5.4.jar:lib/google-api-client-1.26.0.jar:lib/netty-buffer-4.1.68.Final.jar:lib/druid-sql-24.0.0.jar:lib/log4j-slf4j-impl-2.18.0.jar:lib/commons-compiler-3.0.11.jar:lib/caffeine-2.8.0.jar:lib/jackson-module-guice-2.10.5.jar:lib/jetty-security-9.4.48.v20220622.jar:lib/maven-repository-metadata-3.1.1.jar:lib/commons-io-2.11.0.jar:lib/jackson-jq-0.0.10.jar:lib/asm-commons-9.3.jar:lib/compress-lzf-1.0.4.jar:lib/commons-codec-1.13.jar:lib/protobuf-java-3.11.0.jar:lib/config-magic-0.9.jar:lib/airline-io-2.8.4.jar:lib/audience-annotations-0.5.0.jar:lib/fastutil-extra-8.5.4.jar:lib/txw2-2.3.1.jar:lib/jersey-core-1.19.4.jar:lib/avatica-core-1.17.0.jar:lib/netty-3.10.6.Final.jar:lib/fastutil-core-8.5.4.jar:lib/druid-index
ing-hadoop-24.0.0.jar:lib/jetty-client-9.4.48.v20220622.jar:lib/druid-indexing-service-24.0.0.jar:lib/commons-compress-1.21.jar:lib/reactive-streams-1.0.2.jar:lib/maven-model-builder-3.1.1.jar:lib/javax.el-api-3.0.0.jar:lib/netty-resolver-dns-4.1.68.Final.jar:lib/hibernate-validator-5.2.5.Final.jar:lib/druid-aws-common-24.0.0.jar:lib/sigar-1.6.5.132.jar:lib/maven-settings-3.1.1.jar:lib/log4j-1.2-api-2.18.0.jar:lib/resilience4j-bulkhead-1.3.1.jar:lib/icu4j-55.1.jar:lib/jackson-jaxrs-base-2.10.5.jar:lib/stax-ex-1.8.jar:lib/aether-connector-okhttp-0.0.9.jar:lib/javax.activation-api-1.2.0.jar:lib/netty-codec-4.1.68.Final.jar:lib/calcite-core-1.21.0.jar:lib/netty-codec-http-4.1.68.Final.jar:lib/aws-java-sdk-sts-1.12.264.jar:lib/jersey-servlet-1.19.4.jar:lib/json-path-2.3.0.jar:lib/druid-server-24.0.0.jar:lib/jetty-proxy-9.4.48.v20220622.jar:lib/lz4-java-1.8.0.jar:lib/jaxb-runtime-2.3.1.jar:lib/maven-artifact-3.6.0.jar:lib/jackson-module-jaxb-annotations-2.10.5.jar:lib/antlr4-runtime-4.5.1.jar:lib/curator-framework-4.3.0.jar:lib/druid-console-24.0.0.jar:lib/datasketches-java-3.2.0.jar:lib/RoaringBitmap-0.9.0.jar:lib/guice-4.1.0.jar:lib/log4j-api-2.18.0.jar:lib/jakarta.inject-api-1.0.3.jar:lib/netty-codec-socks-4.1.68.Final.jar:lib/jackson-annotations-2.10.5.jar:lib/derbyclient-10.14.2.0.jar:lib/aether-connector-file-0.9.0.M2.jar:lib/aws-java-sdk-ec2-1.12.264.jar:lib/disruptor-3.3.6.jar:lib/datasketches-memory-2.0.0.jar:lib/wagon-provider-api-2.4.jar:lib/xz-1.8.jar:lib/jetty-util-ajax-9.4.48.v20220622.jar:lib/commons-dbcp2-2.0.1.jar:lib/netty-handler-proxy-4.1.68.Final.jar:lib/javax.activation-1.2.0.jar:lib/jcodings-1.0.43.jar:lib/cron-scheduler-0.1.jar:lib/jsr305-2.0.1.jar:lib/aws-java-sdk-kms-1.12.264.jar:lib/jakarta.xml.bind-api-2.3.2.jar:lib/FastInfoset-1.2.15.jar:lib/jackson-datatype-joda-2.10.5.jar:lib/jcl-over-slf4j-1.7.36.jar:lib/druid-services-24.0.0.jar:lib/jetty-server-9.4.48.v20220622.jar:lib/curator-x-discovery-4.3.0.jar:lib/jvm-attach-api-1.5.jar:lib/jackson-
jaxrs-smile-provider-2.10.5.jar:lib/maven-settings-builder-3.1.1.jar:lib/javax.el-3.0.0.jar:lib/google-http-client-1.26.0.jar:lib/rhino-1.7.11.jar:lib/jdbi-2.63.1.jar:lib/extendedset-24.0.0.jar:lib/netty-handler-4.1.68.Final.jar:lib/error_prone_annotations-2.11.0.jar:lib/commons-lang-2.6.jar:lib/jetty-util-9.4.48.v20220622.jar:lib/aether-api-0.9.0.M2.jar:lib/maven-aether-provider-3.1.1.jar:lib/commons-lang3-3.8.1.jar:lib/plexus-interpolation-1.19.jar:lib/httpcore-4.4.11.jar:lib/commons-pool2-2.2.jar:lib/druid-hll-24.0.0.jar:lib/jetty-continuation-9.4.48.v20220622.jar:lib/joni-2.1.27.jar:lib/istack-commons-runtime-3.0.7.jar:lib/aws-java-sdk-s3-1.12.264.jar:lib/ion-java-1.0.2.jar:lib/netty-common-4.1.68.Final.jar:lib/netty-transport-4.1.68.Final.jar:lib/jetty-rewrite-9.4.48.v20220622.jar:lib/jersey-server-1.19.4.jar:lib/derbynet-10.14.2.0.jar:lib/jetty-servlets-9.4.48.v20220622.jar:lib/tesla-aether-0.0.5.jar:lib/jmespath-java-1.12.264.jar:lib/zookeeper-3.5.9.jar:lib/async-http-client-netty-utils-2.5.3.jar:lib/druid-processing-24.0.0.jar:lib/jetty-http-9.4.48.v20220622.jar:lib/slf4j-api-1.7.36.jar:lib/jackson-dataformat-cbor-2.10.5.jar:lib/jakarta.activation-api-1.2.1.jar:lib/aws-java-sdk-core-1.12.264.jar:lib/jaxb-api-2.3.1.jar:lib/j2objc-annotations-1.1.jar:lib/aether-spi-0.9.0.M2.jar:lib/jsr311-api-1.1.1.jar:lib/commons-math3-3.6.1.jar:lib/ipaddress-5.3.4.jar:lib/jersey-guice-1.19.4.jar:lib/airline-2.8.4.jar:lib/jackson-datatype-guava-2.10.5.jar:lib/commons-logging-1.1.1.jar:lib/asm-tree-9.3.jar:lib/netty-resolver-4.1.68.Final.jar:lib/jackson-core-2.10.5.jar:lib/commons-text-1.3.jar:lib/resilience4j-core-1.3.1.jar:lib/maven-model-3.1.1.jar:lib/netty-transport-native-unix-common-4.1.68.Final.jar:lib/checker-qual-2.5.7.jar:lib/okhttp-1.0.2.jar:lib/guice-multibindings-4.1.0.jar:lib/javax.servlet-api-3.1.0.jar:lib/jetty-io-9.4.48.v20220622.jar:lib/vavr-0.10.2.jar:lib/google-http-client-jackson2-1.26.0.jar:lib/netty-codec-dns-4.1.68.Final.jar:lib/opencsv-4.6.jar:lib/jack
son-databind-2.10.5.1.jar:lib/aether-impl-0.9.0.M2.jar:lib/plexus-utils-3.0.24.jar:lib/jboss-logging-3.2.1.Final.jar:lib/esri-geometry-api-2.2.0.jar:lib/asm-analysis-9.3.jar:lib/druid-gcp-common-24.0.0.jar:lib/log4j-core-2.18.0.jar:lib/shims-0.9.0.jar:",
  "java.class.version": "52.0",
  "java.endorsed.dirs": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/endorsed",
  "java.ext.dirs": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/ext:/usr/java/packages/lib/ext",
  "java.home": "/usr/lib/jvm/java-8-openjdk-amd64/jre",
  "java.io.tmpdir": "/druid/data",
  "java.library.path": "/usr/java/packages/lib/amd64:/usr/lib/x86_64-linux-gnu/jni:/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:/usr/lib/jni:/lib:/usr/lib",
  "java.runtime.name": "OpenJDK Runtime Environment",
  "java.runtime.version": "1.8.0_275-8u275-b01-1~deb9u1-b01",
  "java.specification.name": "Java Platform API Specification",
  "java.specification.vendor": "Oracle Corporation",
  "java.specification.version": "1.8",
  "java.util.logging.manager": "org.apache.logging.log4j.jul.LogManager",
  "java.vendor": "Oracle Corporation",
  "java.vendor.url": "http://java.oracle.com/",
  "java.vendor.url.bug": "http://bugreport.sun.com/bugreport/",
  "java.version": "1.8.0_275",
  "java.vm.info": "mixed mode",
  "java.vm.name": "OpenJDK 64-Bit Server VM",
  "java.vm.specification.name": "Java Virtual Machine Specification",
  "java.vm.specification.vendor": "Oracle Corporation",
  "java.vm.specification.version": "1.8",
  "java.vm.vendor": "Oracle Corporation",
  "java.vm.version": "25.275-b01",
  "line.separator": "\n",
  "log4j.shutdownCallbackRegistry": "org.apache.druid.common.config.Log4jShutdown",
  "log4j.shutdownHookEnabled": "true",
  "log4j2.is.webapp": "false",
  "net.spy.log.LoggerImpl": "net.spy.memcached.compat.log.SLF4JLogger",
  "org.jboss.logging.provider": "slf4j",
  "os.arch": "amd64",
  "os.name": "Linux",
  "os.version": "5.4.202+",
  "path.separator": ":",
  "sun.arch.data.model": "64",
  "sun.boot.class.path": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/resources.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/rt.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/sunrsasign.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/jsse.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/jce.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/charsets.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/jfr.jar:/usr/lib/jvm/java-8-openjdk-amd64/jre/classes",
  "sun.boot.library.path": "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64",
  "sun.cpu.endian": "little",
  "sun.cpu.isalist": "",
  "sun.io.unicode.encoding": "UnicodeLittle",
  "sun.java.command": "org.apache.druid.cli.Main server coordinator",
  "sun.java.launcher": "SUN_STANDARD",
  "sun.jnu.encoding": "UTF-8",
  "sun.management.compiler": "HotSpot 64-Bit Tiered Compilers",
  "sun.os.patch.level": "unknown",
  "user.dir": "/opt/druid",
  "user.home": "/root",
  "user.language": "en",
  "user.name": "root",
  "user.timezone": "UTC"
}

There are no error messages, but this is the log from the Middle-Manager and the Coordinator/Overlord leader. The other Middle-Managers have the same logs.

Middle-Manager
2022-11-08T17:56:41+0000 startup service middleManager
Setting druid.host=10.28.1.28 in /tmp/conf/druid/cluster/data/middleManager/runtime.properties
2022-11-08T17:56:49,272 WARN [main] org.apache.druid.k8s.discovery.K8sDiscoveryConfig - IF coordinator pods run in multiple namespaces, then you MUST provide coordinatorLeaderElectionConfigMapNamespace
2022-11-08T17:56:49,276 WARN [main] org.apache.druid.k8s.discovery.K8sDiscoveryConfig - IF overlord pods run in multiple namespaces, then you MUST provide overlordLeaderElectionConfigMapNamespace
2022-11-08T17:56:52,417 WARN [main] org.eclipse.jetty.server.handler.gzip.GzipHandler - minGzipSize of 0 is inefficient for short content, break even is size 23
2022-11-08T17:56:56,269 WARN [qtp546936087-66] org.apache.druid.indexing.common.config.TaskConfig - Batch processing mode argument value is null or not valid:[null], defaulting to[CLOSED_SEGMENTS] 
Coordinator/Overlord
2022-11-08T14:21:20+0000 startup service coordinator
Setting druid.host=10.28.27.21 in /tmp/conf/druid/cluster/master/coordinator-overlord/runtime.properties
2022-11-08T14:21:25,787 WARN [main] org.apache.druid.k8s.discovery.K8sDiscoveryConfig - IF coordinator pods run in multiple namespaces, then you MUST provide coordinatorLeaderElectionConfigMapNamespace
2022-11-08T14:21:25,792 WARN [main] org.apache.druid.k8s.discovery.K8sDiscoveryConfig - IF overlord pods run in multiple namespaces, then you MUST provide overlordLeaderElectionConfigMapNamespace
2022-11-08T14:21:28,176 WARN [main] org.eclipse.jetty.server.handler.gzip.GzipHandler - minGzipSize of 0 is inefficient for short content, break even is size 23
2022-11-08T14:21:29,694 WARN [K8sDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='10.28.4.25', bindOnHost=false, port=-1, plaintextPort=8088, enablePlaintextPort=true, tlsPort=-1, enableTlsPort=false}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='broker'}}}] discovered but doesn't have service[dataNodeService]. Ignored.
2022-11-08T14:21:29,695 WARN [K8sDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='10.28.8.20', bindOnHost=false, port=-1, plaintextPort=8088, enablePlaintextPort=true, tlsPort=-1, enableTlsPort=false}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='broker'}}}] discovered but doesn't have service[dataNodeService]. Ignored.
2022-11-08T14:21:29,695 WARN [K8sDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='10.28.15.3', bindOnHost=false, port=-1, plaintextPort=8088, enablePlaintextPort=true, tlsPort=-1, enableTlsPort=false}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='broker'}}}] discovered but doesn't have service[dataNodeService]. Ignored.
2022-11-08T14:21:29,695 WARN [K8sDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='10.28.2.24', bindOnHost=false, port=-1, plaintextPort=8088, enablePlaintextPort=true, tlsPort=-1, enableTlsPort=false}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='broker'}}}] discovered but doesn't have service[dataNodeService]. Ignored.
2022-11-08T14:21:29,696 WARN [K8sDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='10.28.0.12', bindOnHost=false, port=-1, plaintextPort=8088, enablePlaintextPort=true, tlsPort=-1, enableTlsPort=false}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='broker'}}}] discovered but doesn't have service[dataNodeService]. Ignored.

This seems to be the same symptom as this thread on the forum.

This error doesn't happen when using ZooKeeper for leader election, but using ZooKeeper brings our whole cluster down whenever something forcefully breaks the connection between the Coordinators and ZooKeeper due to #13167

Fryuni avatar Nov 09 '22 01:11 Fryuni

Related prints

There were supposed to be 20 middle_managers: [image] We call the Middle Manager pods "indexers": [image]

If a task is lucky enough to be assigned to a Middle Manager, it completes fine. Otherwise, it stays there until either we restart the Middle Managers so that they reappear, or it times out with this message: [image]

PS: The message is truncated on the JSON object for the task, not on the UI. I couldn't find the whole message anywhere.

Fryuni avatar Nov 09 '22 01:11 Fryuni

Just to be clear, you are saying this only happens when using the ZK-less discovery in druid-kubernetes-extensions, right? I'm not familiar with how that extension works, although if you are having a problem with ZK-based discovery too then that lack of familiarity isn't as important 🙂

gianm avatar Nov 21 '22 05:11 gianm

Yes, this only happens with ZK-less discovery.

PR #13295 fixed the problem with the ZK-based discovery, so we are using a custom image built from that PR and waiting for a release now that it was merged.

Fryuni avatar Nov 21 '22 11:11 Fryuni

It looks like the issue happens because the peon, when it terminates, unannounces the middlemanager it runs on and removes the druidDiscoveryAnnouncement-cluster-identifier and druidDiscoveryAnnouncement-id-hash labels from the middlemanager pod.

Labels with no peons:

druidDiscoveryAnnouncement-cluster-identifier: dev
druidDiscoveryAnnouncement-id-hash: "1276105065"
druidDiscoveryAnnouncement-middleManager: "true"

Labels with peon:

druidDiscoveryAnnouncement-cluster-identifier: dev
druidDiscoveryAnnouncement-id-hash: "1276108075"
druidDiscoveryAnnouncement-middleManager: "true"
druidDiscoveryAnnouncement-peon: "true"

Labels after peon finishes:

druidDiscoveryAnnouncement-middleManager: "true"

Sh1ftry avatar Aug 28 '23 12:08 Sh1ftry

Hello, I am facing the same issue, where middle managers are disappearing for every index_kafka task in a Kubernetes deployment of Druid. Has any fix or workaround been found for the issue? Please let me know.

ahemad-shaik avatar Nov 10 '23 01:11 ahemad-shaik

This issue basically makes the kubernetes druid extension broken and unusable.

jwitko avatar Jun 13 '24 17:06 jwitko

This issue has been marked as stale due to 280 days of inactivity. It will be closed in 4 weeks if no further activity occurs. If this issue is still relevant, please simply write any comment. Even if closed, you can still revive the issue at any time or discuss it on the dev@druid.apache.org list. Thank you for your contributions.

github-actions[bot] avatar Mar 21 '25 00:03 github-actions[bot]

/fresh

applike-ss avatar Mar 21 '25 06:03 applike-ss

I'm having the exact same issue here with the latest version of Druid (v32.0.1). I have a Kafka job running and after the job completes successfully the MiddleManager disappears.

As Sh1ftry pointed out previously, the Kubernetes extension seems to remove the discovery labels (druidDiscoveryAnnouncement-cluster-identifier and druidDiscoveryAnnouncement-id-hash) from the MiddleManager pod after the peon completes.

Before:

Labels:           app=druid
                  apps.kubernetes.io/pod-index=0
                  component=middleManager
                  controller-revision-hash=druid-cluster-middlemanagers-7fcff47567
                  druidDiscoveryAnnouncement-cluster-identifier=cluster
                  druidDiscoveryAnnouncement-id-hash=837274055
                  druidDiscoveryAnnouncement-middleManager=true
                  druid_cr=cluster
                  nodeSpecUniqueStr=druid-cluster-middlemanagers
                  statefulset.kubernetes.io/pod-name=druid-cluster-middlemanagers-0
Annotations:      druidNodeInfo-middleManager:
                    {"druidNode":{"service":"druid/middleManager","host":"10.0.164.21","bindOnHost":false,"plaintextPort":8088,"port":-1,"tlsPort":-1,"enableP...

After Kafka job is completed successfully:

Labels:           app=druid
                  apps.kubernetes.io/pod-index=0
                  component=middleManager
                  controller-revision-hash=druid-cluster-middlemanagers-7fcff47567
                  druidDiscoveryAnnouncement-middleManager=true
                  druid_cr=cluster
                  nodeSpecUniqueStr=druid-cluster-middlemanagers
                  statefulset.kubernetes.io/pod-name=druid-cluster-middlemanagers-0
Annotations:      druidNodeInfo-middleManager:
                    {"druidNode":{"service":"druid/middleManager","host":"10.0.154.9","bindOnHost":false,"plaintextPort":8088,"port":-1,"tlsPort":-1,"enablePl...

This leads to errors in the cluster not being able to find the MiddleManager:

2025-04-15T11:56:43,927 ERROR [org.apache.druid.k8s.discovery.K8sDruidNodeDiscoveryProvider$NodeRoleWatchermiddleManager] org.apache.druid.discovery.BaseNodeRoleWatcher - Noticed disappearance of unknown druid node [http://10.0.154.9:8088] of role [middleManager].

The labels seem to get removed by the K8sDruidNodeAnnouncer class:

2025-04-15T11:52:25,333 INFO [task-runner-0-priority-0] org.apache.druid.k8s.discovery.K8sDruidNodeAnnouncer - Unannouncing DiscoveryDruidNode[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/middleManager', host='10.0.154.9', bindOnHost=false, port=-1, plaintextPort=8100, enablePlaintextPort=true, tlsPort=-1, enableTlsPort=false}, nodeRole='PEON', services={dataNodeService=DataNodeService{tier='_default_tier', maxSize=0, serverType=indexer-executor, priority=0}, lookupNodeService=LookupNodeService{lookupTier='__default'}}', startTime=2025-04-15T11:51:24.122Z}]

The jobs in the cluster which have finished successfully are still displayed as "Running".

Is there any fix planned for this as of now, or any kind of configuration which can mitigate this issue? As pointed out previously, this basically still makes the whole Kubernetes extension unusable.

Thanks a lot! :)

EDIT: In case anyone else is experiencing the same problem and is looking for a solution to get Druid working without Zookeeper, check out the druid-kubernetes-overlord-extensions extension - that extension uses Kubernetes Jobs as workers and hence doesn't experience that issue. I used it together with the Kubernetes operator and found a working example here: https://github.com/iunera/druid-cluster-config/tree/main/kubernetes/druid/druidcluster

com98 avatar Apr 15 '25 12:04 com98

Also running into this issue on v33.0.0.

@com98, I was trying the druid-kubernetes-overlord-extensions extension, but it seems not to work when using Google Cloud Storage for indexer logs. Can you share what you were using for log storage?

java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
        at org.apache.druid.indexing.overlord.DruidOverlord$1.becomeLeader(DruidOverlord.java:193) ~[druid-indexing-service-33.0.0.jar:33.0.0]
        at org.apache.druid.k8s.discovery.K8sDruidLeaderSelector.lambda$startLeaderElector$0(K8sDruidLeaderSelector.java:77) ~[?:?]
        at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539) [?:?]
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) [?:?]
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) [?:?]
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) [?:?]
        at java.base/java.lang.Thread.run(Thread.java:840) [?:?]
Caused by: java.lang.reflect.InvocationTargetException
        at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:?]
        at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77) ~[?:?]
        at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:?]
        at java.base/java.lang.reflect.Method.invoke(Method.java:569) ~[?:?]
        at org.apache.druid.java.util.common.lifecycle.Lifecycle$AnnotationBasedHandler.start(Lifecycle.java:446) ~[druid-processing-33.0.0.jar:33.0.0]
        at org.apache.druid.java.util.common.lifecycle.Lifecycle.start(Lifecycle.java:341) ~[druid-processing-33.0.0.jar:33.0.0]
        at org.apache.druid.indexing.overlord.DruidOverlord$1.becomeLeader(DruidOverlord.java:190) ~[druid-indexing-service-33.0.0.jar:33.0.0]
        ... 6 more
Caused by: org.apache.commons.lang3.NotImplementedException: this druid.indexer.logs.type [class org.apache.druid.storage.google.GoogleTaskLogs] does not support managing task payloads yet. You will have to switch to using environment variables

j-laberge avatar Jun 04 '25 16:06 j-laberge

This is still current for us in v34.0.0.

applike-ss avatar Nov 12 '25 11:11 applike-ss

I'm having the same issue in v33.0.0

ddaka avatar Nov 13 '25 14:11 ddaka