clearml-serving icon indicating copy to clipboard operation
clearml-serving copied to clipboard

[Bug] Run clearml serving with clearml server not working

Open david101-hunter opened this issue 3 months ago • 2 comments

Hi, I built a self-hosted ClearML server.

I have followed the official tutorial.

My config:

CLEARML_WEB_HOST="http://localhost:8080"
CLEARML_API_HOST="http://localhost:8008"
CLEARML_FILES_HOST="http://localhost:8081"
CLEARML_API_ACCESS_KEY="J1WS8MDJMI9MMMA9EMH6"
CLEARML_API_SECRET_KEY="5eEYYHq3VMgf9DmQjfLtCiBBV09TgxImQmz8WDckKB0h6t3CIE"
CLEARML_SERVING_TASK_ID="2f5e0563914e41b9a1334f841c39852b"

Because CLEARML_WEB_HOST is on port 8080, I had to change 8080 to 8082 in the docker-compose.yml file:

  clearml-serving-inference:
    image: allegroai/clearml-serving-inference:latest
    container_name: clearml-serving-inference
    restart: unless-stopped
    # optimize performance
    security_opt:
      - seccomp:unconfined
    ports:
      # host port 8082 -> container port 8080 (host 8080 is taken by the ClearML web UI)
      - "8082:8080"

Full docker-compose.yml file — I changed it to use the same Docker network as the ClearML server:

---
# docker-compose for the clearml-serving stack, attached to the externally
# created ClearML server networks (clearml_backend / clearml_frontend) so the
# serving containers can reach the API server by its container/service name.
version: "3"

services:
  zookeeper:
    image: bitnami/zookeeper:3.7.0
    container_name: clearml-serving-zookeeper
    # ports:
      # - "2181:2181"
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
    networks:
      - clearml_backend
      - clearml_frontend
      # - clearml-serving-backend

  kafka:
    image: bitnami/kafka:3.1.1
    container_name: clearml-serving-kafka
    # ports:
      # - "9092:9092"
    environment:
      - KAFKA_BROKER_ID=1
      - KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
      - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes
      # NOTE: no embedded quotes — in list-form environment entries the quotes
      # would become part of the variable's value
      - KAFKA_CREATE_TOPICS=topic_test:1:1
    depends_on:
      - zookeeper
    networks:
      - clearml_backend
      - clearml_frontend
      # - clearml-serving-backend

  prometheus:
    image: prom/prometheus:v2.34.0
    container_name: clearml-serving-prometheus
    volumes:
      - ./prometheus.yml:/prometheus.yml
    command:
      - '--config.file=/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # ports:
      # - "9090:9090"
    depends_on:
      - clearml-serving-statistics
    networks:
      - clearml_backend
      - clearml_frontend
      # - clearml-serving-backend

  alertmanager:
    image: prom/alertmanager:v0.23.0
    container_name: clearml-serving-alertmanager
    restart: unless-stopped
    # ports:
      # - "9093:9093"
    depends_on:
      - prometheus
      - grafana
    networks:
      - clearml_backend
      - clearml_frontend
      # - clearml-serving-backend

  grafana:
    image: grafana/grafana:8.4.4-ubuntu
    container_name: clearml-serving-grafana
    volumes:
      - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml'
    restart: unless-stopped
    ports:
      - "3000:3000"
    depends_on:
      - prometheus
    networks:
      - clearml_backend
      - clearml_frontend
      # - clearml-serving-backend

  clearml-serving-inference:
    image: allegroai/clearml-serving-inference:latest
    container_name: clearml-serving-inference
    restart: unless-stopped
    # optimize performance
    security_opt:
      - seccomp:unconfined
    ports:
      - "8082:8082"
    environment:
      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
      CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8082}
      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
      CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8082/serve}
      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
      CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-}
      CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
      CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
      CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
      AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
      AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
      AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-}
      GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-}
      AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-}
      AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-}
    depends_on:
      - kafka
    networks:
      - clearml_backend
      - clearml_frontend
      # - clearml-serving-backend

  clearml-serving-statistics:
    image: allegroai/clearml-serving-statistics:latest
    container_name: clearml-serving-statistics
    restart: unless-stopped
    # optimize performance
    security_opt:
      - seccomp:unconfined
    # ports:
      # - "9999:9999"
    environment:
      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
    depends_on:
      - kafka
    networks:
      - clearml_backend
      - clearml_frontend
      # - clearml-serving-backend

networks:
  # clearml-serving-backend:
  #   driver: bridge
  # external networks created by the ClearML server's own compose stack
  clearml_backend:
    external: true
  clearml_frontend:
    external: true

When I run:

docker-compose --env-file example.env -f docker-compose.yml up

I see many errors like this: [Errno 111] Connection refused')': /auth.login

Detail:

broker clearml-serving-kafka:9092 (id: 1 rack: null) (kafka.server.BrokerToControllerRequestThread)
clearml-serving-statistics    | Retrying (Retry(total=238, connect=238, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8e89ad4f50>: Failed to establish a new connection: [Errno 111] Connection refused')': /auth.login
clearml-serving-alertmanager  | level=info ts=2024-03-26T09:22:22.539Z caller=cluster.go:696 component=cluster msg="gossip not settled" polls=0 before=0 now=1 elapsed=2.000371425s
clearml-serving-inference     | Retrying (Retry(total=238, connect=238, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fae7f2c7550>: Failed to establish a new connection: [Errno 111] Connection refused')': /auth.login
clearml-serving-grafana       | logger=context t=2024-03-26T09:22:23.1+0000 lvl=eror msg="Failed to look up user based on cookie" error="user token not found"
clearml-serving-grafana       | logger=context t=2024-03-26T09:22:23.1+0000 lvl=info msg="Request Completed" method=GET path=/api/live/ws status=401 remote_addr=172.26.0.1 time_ms=0 size=27 referer=
clearml-serving-prometheus    | ts=2024-03-26T09:22:25.806Z caller=compact.go:510 level=info component=tsdb msg="write block resulted in empty block" mint=1709200800000 maxt=1709208000000 duration=21.387885ms
clearml-serving-prometheus    | ts=2024-03-26T09:22:25.810Z caller=head.go:840 level=info component=tsdb msg="Head GC completed" duration=3.617054ms
clearml-serving-prometheus    | ts=2024-03-26T09:22:25.810Z caller=checkpoint.go:98 level=info component=tsdb msg="Creating checkpoint" from_segment=12 to_segment=13 mint=1709208000000
clearml-serving-prometheus    | ts=2024-03-26T09:22:25.877Z caller=head.go:1009 level=info component=tsdb msg="WAL checkpoint complete" first=12 last=13 duration=66.585592ms
clearml-serving-statistics    | Retrying (Retry(total=237, connect=237, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8e899814d0>: Failed to establish a new connection: [Errno 111] Connection refused')': /auth.login
clearml-serving-kafka         | [2024-03-26 09:22:26,811] INFO [Controller id=1] Processing automatic preferred replica leader election (kafka.controller.KafkaController)
clearml-serving-kafka         | [2024-03-26 09:22:26,812] TRACE [Controller id=1] Checking need to trigger auto leader balancing (kafka.controller.KafkaController)
clearml-serving-inference     | Retrying (Retry(total=237, connect=237, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fae7f2d8850>: Failed to establish a new connection: [Errno 111] Connection refused')': /auth.login
clearml-serving-alertmanager  | level=info ts=2024-03-26T09:22:30.541Z caller=cluster.go:688 component=cluster msg="gossip settled; proceeding" elapsed=10.002840409s
clearml-serving-statistics    | Retrying (Retry(total=236, connect=236, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8e899a4490>: Failed to establish a new connection: [Errno 111] Connection refused')': /auth.login
clearml-serving-inference     | Retrying (Retry(total=236, connect=236, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fae96ccfd10>: Failed to establish a new connection: [Errno 111] Connection refused')': /auth.login
clearml-serving-grafana       | logger=context t=2024-03-26T09:22:39.1+0000 lvl=eror msg="Failed to look up user based on cookie" error="user token not found"
clearml-serving-grafana       | logger=context t=2024-03-26T09:22:39.1+0000 lvl=info msg="Request Completed" method=GET path=/api/live/ws status=401 remote_addr=172.26.0.1 time_ms=0 size=27 referer=

What does this error mean? How can I fix it?

Thanks for all!

david101-hunter avatar Mar 26 '24 09:03 david101-hunter