kafka-docker
Unable to get cluster stable in swarm
I am able to run a stable cluster using docker-compose with
# Compose a collection of Docker containers used by Spacejam/Madlands server
# See README in this directory
# Makes a network so the docker image can use the hosts by name
# https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers
# https://docs.docker.com/compose/networking/
# https://docs.google.com/document/d/1isfM3HI-Rxbal9l_v2dyU6pl7CZMpQ_r2irkiMag2vE/edit#heading=h.krkqmakfnk6n
version: '3.2'
services:
  zookeeper:
    image: wurstmeister/zookeeper
    ports:
      - "2181:2181"
    networks:
      - madlands
  kafka-1:
    image: iggcanada/kafka
    ports:
      - target: 9094
        published: 9094
        protocol: tcp
        mode: host
    environment:
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL,OUTSIDE:PLAINTEXT
      KAFKA_LISTENERS: PLAINTEXT://kafka-1:9092,OUTSIDE://0.0.0.0:9094
      KAFKA_INTER_BROKER_PROTOCOL_VERSION: 0.11.0.0
      KAFKA_DELETE_TOPIC_ENABLE: "true"
      KAFKA_LOG_RETENTION_BYTES: -1
      KAFKA_LOG_RETENTION_DAYS: 2
      # Required because of bugs in Kafka 0.11.0.0
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - madlands
  kafka-2:
    image: iggcanada/kafka
    ports:
      - target: 9094
        published: 9095
        protocol: tcp
        mode: host
    environment:
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL,OUTSIDE:PLAINTEXT
      KAFKA_LISTENERS: PLAINTEXT://kafka-2:9092,OUTSIDE://0.0.0.0:9094
      KAFKA_INTER_BROKER_PROTOCOL_VERSION: 0.11.0.0
      KAFKA_DELETE_TOPIC_ENABLE: "true"
      KAFKA_LOG_RETENTION_BYTES: -1
      KAFKA_LOG_RETENTION_DAYS: 2
      # Required because of bugs in Kafka 0.11.0.0
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - madlands
  kafka-3:
    image: iggcanada/kafka
    ports:
      - target: 9094
        published: 9096
        protocol: tcp
        mode: host
    environment:
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL,OUTSIDE:PLAINTEXT
      KAFKA_LISTENERS: PLAINTEXT://kafka-3:9092,OUTSIDE://0.0.0.0:9094
      KAFKA_INTER_BROKER_PROTOCOL_VERSION: 0.11.0.0
      KAFKA_DELETE_TOPIC_ENABLE: "true"
      KAFKA_LOG_RETENTION_BYTES: -1
      KAFKA_LOG_RETENTION_DAYS: 2
      # Required because of bugs in Kafka 0.11.0.0
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - madlands

# Create our named network of type bridge
# For some reason could not get server to connect to Kafka using
# the default bridge network or host network. Not sure why only
# custom bridge network works? EK
networks:
  madlands:
    driver: bridge
This works fine on my development system. However, when I try to get an equivalent cluster running on AWS, the cluster never becomes stable. I am using docker stack . . .
with
version: '3.2'
services:
  zookeeper:
    image: wurstmeister/zookeeper:latest
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == worker
    ports:
      - "2181:2181"
  kafka1:
    image: 003575935058.dkr.ecr.us-west-1.amazonaws.com/iggcanada/kafka
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    ports:
      - target: 9094
        published: 9094
        protocol: tcp
        mode: host
    environment:
      HOSTNAME_COMMAND: "docker info | grep ^Name: | cut -d' ' -f 2"
      KAFKA_ZOOKEEPER_CONNECT: kafka_zookeeper:2181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL,OUTSIDE:PLAINTEXT
      KAFKA_LISTENERS: PLAINTEXT://kafka_kafka1:9092,OUTSIDE://0.0.0.0:9094
      KAFKA_INTER_BROKER_PROTOCOL_VERSION: 0.11.0.0
      KAFKA_DELETE_TOPIC_ENABLE: "true"
      KAFKA_LOG_RETENTION_BYTES: -1
      KAFKA_LOG_RETENTION_DAYS: 2
      # Required because of bugs in Kafka 0.11.0.0
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
  kafka2:
    image: 003575935058.dkr.ecr.us-west-1.amazonaws.com/iggcanada/kafka
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    ports:
      - target: 9094
        published: 9094
        protocol: tcp
        mode: host
    environment:
      HOSTNAME_COMMAND: "docker info | grep ^Name: | cut -d' ' -f 2"
      KAFKA_ZOOKEEPER_CONNECT: kafka_zookeeper:2181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL,OUTSIDE:PLAINTEXT
      KAFKA_LISTENERS: PLAINTEXT://kafka_kafka2:9092,OUTSIDE://0.0.0.0:9094
      KAFKA_INTER_BROKER_PROTOCOL_VERSION: 0.11.0.0
      KAFKA_DELETE_TOPIC_ENABLE: "true"
      KAFKA_LOG_RETENTION_BYTES: -1
      KAFKA_LOG_RETENTION_DAYS: 2
      # Required because of bugs in Kafka 0.11.0.0
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
  kafka3:
    image: 003575935058.dkr.ecr.us-west-1.amazonaws.com/iggcanada/kafka
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    ports:
      - target: 9094
        published: 9094
        protocol: tcp
        mode: host
    environment:
      HOSTNAME_COMMAND: "docker info | grep ^Name: | cut -d' ' -f 2"
      KAFKA_ZOOKEEPER_CONNECT: kafka_zookeeper:2181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL,OUTSIDE:PLAINTEXT
      KAFKA_LISTENERS: PLAINTEXT://kafka_kafka3:9092,OUTSIDE://0.0.0.0:9094
      KAFKA_INTER_BROKER_PROTOCOL_VERSION: 0.11.0.0
      KAFKA_DELETE_TOPIC_ENABLE: "true"
      KAFKA_LOG_RETENTION_BYTES: -1
      KAFKA_LOG_RETENTION_DAYS: 2
      # Required because of bugs in Kafka 0.11.0.0
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
This stack comes up, but what happens is:
- Two brokers seem to come up okay, but the last broker gets hung with
  WARN Connection to node xxxx could not be established. Broker may not be available.
- Eventually, after this container restarts a few times, this broker seems to become stable. BUT then another broker starts failing with
  WARN Connection to node xxxx could not be established. Broker may not be available.
- This pattern repeats indefinitely, and the cluster never becomes stable the way it does under docker-compose.
Any help would be greatly appreciated.
I have the same problem on my local machine 👍
I notice you added placement constraints when you transitioned to stack deploy, and that they seem reversed (i.e. zookeeper on a worker node, all kafka instances on manager nodes). Do you just have one manager node in your docker cluster? Is it a relatively small EC2 instance? If you've overloaded that one Docker host with all of your kafka brokers, it's possible that they're all running slowly enough that they don't always respond to interbroker requests before they time out, which Kafka would flag as a connection problem.
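As a minimal sketch of that suggestion (assuming the swarm has at least three worker nodes; only the deploy sections are shown, everything else stays as in the stack file above), the constraints could be swapped so the brokers are spread away from the single manager:

# Hypothetical sketch: swap the placement constraints so one manager node
# is not hosting all three brokers. Only the deploy sections are shown.
version: '3.2'
services:
  zookeeper:
    image: wurstmeister/zookeeper:latest
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager   # ZooKeeper alone on the manager node
  kafka1:
    image: 003575935058.dkr.ecr.us-west-1.amazonaws.com/iggcanada/kafka
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == worker    # brokers spread across the worker nodes
  # kafka2 and kafka3 would carry the same worker constraint as kafka1

Note also that in the posted stack file all three brokers publish the same host port (published: 9094 with mode: host); if they all land on the same node, only one of them can bind that port, so spreading them out would sidestep that collision as well.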
I have a similar issue; I posted this question on Stack Overflow.