semaphore icon indicating copy to clipboard operation
semaphore copied to clipboard

Remote runner hangs

Open haxwithaxe opened this issue 2 years ago • 6 comments

Observed Behavior

The task hangs at

10:08:49 AM Task 2147483583 added to queue
10:09:03 AM Started: 2147483583
10:09:03 AM Run TaskRunner with template: swarm library 

I don't see any errors that are likely related in either the runner or server. Below are the server logs from the relevant time frame followed by the complete runner logs.

...
semaphore_server.1.9h68p1n6f7et@cricket    | time="2023-10-03T14:08:49Z" level=info msg="Task 2147483583 added to queue"
semaphore_server.1.9h68p1n6f7et@cricket    | time="2023-10-03T14:08:52Z" level=info msg="Set resource locker with TaskRunner 2147483583"
semaphore_server.1.9h68p1n6f7et@cricket    | time="2023-10-03T14:08:52Z" level=info msg="Task 2147483583 removed from queue"
semaphore_server.1.9h68p1n6f7et@cricket    | 
semaphore_server.1.9h68p1n6f7et@cricket    | time="2023-10-03T14:12:03Z" level=error msg="websocket: close sent" level=Error
semaphore_server.1.9h68p1n6f7et@cricket    | time="2023-10-03T14:12:03Z" level=error msg="close tcp 172.21.0.48:3000->172.21.0.60:33320: use of closed network connection" level=Error
Oct 03 10:08:06 runner systemd[1]: Started semaphore-runner.service - Semaphore Ansible Runner.
Oct 03 10:08:07 runner semaphore[42668]: Loading config
Oct 03 10:08:07 runner semaphore[42668]: Validating config
Oct 03 10:08:07 runner semaphore[42668]: time="2023-10-03T10:08:07-04:00" level=info msg="Trying to register on server"
Oct 03 10:08:07 runner semaphore[42668]: time="2023-10-03T10:08:07-04:00" level=info msg="Runner registered on server"

Expected Behavior

What I am expecting is for the task being run to execute as if it were running on the semaphore server itself, just doing the execution on the runner instead. The same task template runs fine if I disable the remote runner in the server config.

Steps to Reproduce

  1. Install the server as a docker service.
  2. Install semaphore on a VM (Debian 12)
  3. Configure and start the runner (including server config).
  4. Setup all the bits and pieces on the server needed for a task template.
  5. Run the configured task template.

Extra info

  • I tried this with the current stable release (v2.9.37) and v2.9.39-beta with the server and runner having the same version.
  • I installed semaphore on the VM via the .deb package for both versions.
  • semaphore server config
{
 	"mysql": {
 		"host": "",
 		"user": "",
 		"pass": "",
 		"name": "",
 		"options": null
 	},
 	"bolt": {
 		"host": "/var/lib/semaphore/database.boltdb",
 		"user": "",
 		"pass": "",
 		"name": "",
 		"options": null
 	},
 	"postgres": {
 		"host": "",
 		"user": "",
 		"pass": "",
 		"name": "",
 		"options": null
 	},
 	"dialect": "bolt",
 	"port": "",
 	"interface": "",
 	"tmp_path": "/tmp/semaphore",
 	"ssh_config_path": "",
 	"git_client": "",
 	"web_host": "",
 	"cookie_hash": "-redacted-",
 	"cookie_encryption": "-redacted-",
 	"access_key_encryption": "-redacted-",
 	"email_alert": true,
 	"email_sender": "semaphore <-redacted->",
 	"email_host": "-redacted-",
 	"email_port": "-redacted-",
 	"email_username": "-redacted-",
 	"email_password": "'-redacted-",
 	"email_secure": true,
 	"ldap_enable": false,
 	"ldap_binddn": "",
 	"ldap_bindpassword": "",
 	"ldap_server": "",
 	"ldap_searchdn": "",
 	"ldap_searchfilter": "",
 	"ldap_mappings": {
 		"dn": "",
 		"mail": "",
 		"uid": "",
 		"cn": ""
 	},
 	"ldap_needtls": false,
 	"telegram_alert": false,
 	"telegram_chat": "",
 	"telegram_token": "",
 	"slack_alert": false,
 	"slack_url": "",
 	"oidc_providers": null,
 	"max_parallel_tasks": 0,
 	"runner_registration_token": "",
 	"password_login_disable": false,
 	"non_admin_can_create_project": false,
 	"use_remote_runner": true,
 	"runner": {
 		"api_url": "",
 		"registration_token": "",
 		"config_file": "",
 		"one_off": false,
 		"webhook": "",
 		"max_parallel_tasks": 0
 	},
 	"billing_enabled": false
 }
  • semaphore runner config
{
 	"mysql": {
 		"host": "",
 		"user": "",
 		"pass": "",
 		"name": "",
 		"options": null
 	},
 	"bolt": {
 		"host": "/home/semaphore/.cache/semaphore/database.boltdb",
 		"user": "",
 		"pass": "",
 		"name": "",
 		"options": null
 	},
 	"postgres": {
 		"host": "",
 		"user": "",
 		"pass": "",
 		"name": "",
 		"options": null
 	},
 	"dialect": "bolt",
 	"port": "",
 	"interface": "",
 	"tmp_path": "/tmp/semaphore",
 	"ssh_config_path": "",
 	"git_client": "",
 	"web_host": "",
 	"cookie_hash": "-redacted-",
 	"cookie_encryption": "-redacted-",
 	"access_key_encryption": "-redacted-",
 	"email_alert": false,
 	"email_sender": "",
 	"email_host": "",
 	"email_port": "",
 	"email_username": "",
 	"email_password": "",
 	"email_secure": false,
 	"ldap_enable": false,
 	"ldap_binddn": "",
 	"ldap_bindpassword": "",
 	"ldap_server": "",
 	"ldap_searchdn": "",
 	"ldap_searchfilter": "",
 	"ldap_mappings": {
 		"dn": "",
 		"mail": "",
 		"uid": "",
 		"cn": ""
 	},
 	"ldap_needtls": false,
 	"telegram_alert": false,
 	"telegram_chat": "",
 	"telegram_token": "",
 	"slack_alert": false,
 	"slack_url": "",
 	"oidc_providers": null,
 	"max_parallel_tasks": 0,
 	"runner_registration_token": "",
 	"password_login_disable": false,
 	"non_admin_can_create_project": false,
 	"use_remote_runner": false,
 	"runner": {
 		"api_url": "https://semaphore.local/api",
 		"registration_token": "-redacted-",
 		"config_file": "/home/semaphore/.config/semaphore-runner.json",
 		"one_off": false,
 		"webhook": "",
 		"max_parallel_tasks": 20
 	},
 	"billing_enabled": false
 }
  • semaphore-runner.service
[Unit]
Description=Semaphore Ansible Runner
Documentation=https://github.com/ansible-semaphore/semaphore
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=semaphore
Group=semaphore
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/bin/semaphore runner --config=/home/semaphore/.config/semaphore-runner.json
SyslogIdentifier=semaphore
Restart=always
RestartSec=10s

[Install]
WantedBy=multi-user.target
  • I also tried v2.9.39-beta with a docker runner with the same result. Here is the docker stack file I used. It's a jinja template and I left the things that I would have had to redact as they appear in the template. To make the runner section, I just copied and pasted the server section as the runner section, removed the ports section, and changed the image to the runner image.
version: "3.9"

configs:
  server_config:
    file: "./config.json"
  runner_config:
    file: "./runner.json"

services:

  server:
    image: semaphoreui/semaphore:v2.9.39-beta
    ports:
      - "33000:3000"
    networks:
      - swarm-net
    configs:
      - source: server_config
        target: /etc/semaphore/config.json
    volumes:
      # I condensed all the files into one directory on the server
      - server_data:/var/lib/semaphore
      - server_data:/home/semaphore
    environment:
      SEMAPHORE_DB_DIALECT: bolt
      SEMAPHORE_ADMIN_PASSWORD: "{{ semaphore_admin_password }}"
      SEMAPHORE_ADMIN_NAME: "{{ semaphore_admin_name }}"
      SEMAPHORE_ADMIN_EMAIL: "{{ semaphore_admin_email }}"
      SEMAPHORE_ADMIN: "{{ semaphore_admin_username }}"

  runner:
    image: semaphoreui/runner:v2.9.39-beta
    networks:
      - swarm-net
    configs:
      - source: runner_config
        target: /etc/semaphore/config.json
    volumes:
      - runner_data:/var/lib/semaphore
    environment:
      SEMAPHORE_DB_DIALECT: bolt
      SEMAPHORE_ADMIN_PASSWORD: "{{ semaphore_admin_password }}"
      SEMAPHORE_ADMIN_NAME: "{{ semaphore_admin_name }}"
      SEMAPHORE_ADMIN_EMAIL: "{{ semaphore_admin_email }}"
      SEMAPHORE_ADMIN: "{{ semaphore_admin_username }}"

networks:
  swarm-net:
    external: true

volumes:
  # These point to glusterfs volumes in reality but it's a mess of template variables that I didn't want to translate and it's not relevant
  server_data: {}
  runner_data: {}

haxwithaxe avatar Oct 03 '23 16:10 haxwithaxe

Can confirm this; I too get a hang when running the task. I suspect it's the main semaphore server that hangs, as a run using the remote runner errored because it couldn't get any data from the database.

mhzawadi avatar Oct 05 '23 18:10 mhzawadi

I've created a custom image for both server and runner for simpler deployment into a Kubernetes cluster. In the process of testing and iterating on these, I've found that exiting an existing runner container seems to cause these hangs. The temporary solution I found is to delete the deployment and statefulset (basically remove the database and semaphore containers) and start from scratch.

My assumption is that the failed or exited runners are not being cleared out of the list of runners on the server end. I went poking around into the API docs and it looks like there's not a ton of info about how to interact with that part of the API.

mhill8304 avatar Nov 03 '23 19:11 mhill8304

For me it's a single runner on another server, and the runner is still up; it's just the task that hangs.

haxwithaxe avatar Nov 03 '23 20:11 haxwithaxe

I can confirm that problem and here is a temporary workaround:

1: empty runner sql table:

mariadb -u semaphore -p
use semaphore;
delete from runner;

2: remove runner.cfg (or whatever it is called in your config file)

3: re-register runner: semaphore runner --config ./config.json

This worked for me. Good Luck ! :)

thyseus avatar Nov 08 '23 09:11 thyseus

Somehow no runners are in the SQL table. However, the runner's logs say "Runner registered on server". How can we check where the runner is registered?
Does the runner need DB connectivity in the config.json? I was of the opinion that the semaphore server would broker all the connections to the DB, including the ones from the runner.

Somehow the doc is severely lacking in this regard.

manju-rn avatar Dec 18 '23 01:12 manju-rn

Thought I should leave this here for anyone who faces the same issue as me.

I faced the same issue with my deployment in a Kubernetes cluster some days ago where I have one deployment for Semaphore UI and one Stateful Set for the runners.

As @thyseus mentioned, deleting all records from the database and restarting the runner solves the problem temporarily.

After investigating the issue with the multiple records in the runner table, I noticed that each runner container created a new config_file on startup (as mentioned in the documentation) containing the runner_id and the token, which caused it to re-register itself with the server and ultimately create multiple records in the runner table (I guess the runner doesn't find the config and tries to re-register itself with the server).

As a solution, I configured my runner containers to place the config_file in a persistent volume so that the pod can find it on startup and avoid re-registering itself with the server, which is what causes the multiple records in the runner table.

{
  "runner": {
    "registration_token": "***",
    "config_file": "path/to/the/file/where/runner/saves/service/information",  <-- THIS IN A PERSISTENT VOLUME
    "api_url": "http://<semaphore_host>:<semaphore_port>/api",
    "max_parallel_tasks": 10
  }
}

I guess the same solution can be applied to Docker deployments (just place the runner config in a volume) to avoid multiple records in the runner table after every restart of a runner container.

dmitronat avatar Feb 14 '24 16:02 dmitronat

Hey @dmitronat

How did you get the startup config into your runner container? I can't seem to make the runner work at all in Docker.

mhzawadi avatar Mar 26 '24 11:03 mhzawadi

Never mind, I got it working. If you need any help, below are the configs I used.

compose file

---
version: '3.5'

# docker stack deploy --compose-file runner.yml semaphore

volumes:
  semaphore_config:

configs:
  runner_config:
     file: ./runner_config.conf

services:
  semaphore-runner:
    image: semaphoreui/runner:v2.9.56
    configs:
      - source: runner_config
        target: /etc/semaphore/config.json
    volumes:
      - semaphore_config:/etc/semaphore
    deploy:
      update_config:
        order: start-first
      mode: global
      labels:
        - traefik.enable=false

config.json

{
  "mysql":{
      "host":"mysql:3306",
      "user":"SECRETCODE",
      "pass":"SECRETCODE",
      "name":"SECRETCODE",
      "options":null
   },
   "bolt":{
      "host":"",
      "user":"",
      "pass":"",
      "name":"",
      "options":null
   },
   "postgres":{
      "host":"",
      "user":"",
      "pass":"",
      "name":"",
      "options":null
   },
   "dialect":"mysql",
 	"port": "",
 	"interface": "",
 	"tmp_path": "/tmp/semaphore",
 	"ssh_config_path": "",
 	"git_client": "",
 	"web_host": "",
  "cookie_hash":"SECRETCODE",
  "cookie_encryption":"SECRETCODE",
  "access_key_encryption":"SECRETCODE",
 	"email_alert": false,
 	"email_sender": "",
 	"email_host": "",
 	"email_port": "",
 	"email_username": "",
 	"email_password": "",
 	"email_secure": false,
 	"ldap_enable": false,
 	"ldap_binddn": "",
 	"ldap_bindpassword": "",
 	"ldap_server": "",
 	"ldap_searchdn": "",
 	"ldap_searchfilter": "",
 	"ldap_mappings": {
 		"dn": "",
 		"mail": "",
 		"uid": "",
 		"cn": ""
 	},
 	"ldap_needtls": false,
 	"telegram_alert": false,
 	"telegram_chat": "",
 	"telegram_token": "",
 	"slack_alert": false,
 	"slack_url": "",
 	"oidc_providers": null,
 	"max_parallel_tasks": 0,
 	"runner_registration_token": "",
 	"password_login_disable": false,
 	"non_admin_can_create_project": false,
 	"use_remote_runner": false,
 	"runner": {
 		"api_url": "https://semaphore/api",
 		"registration_token": "SECRETCODE",
 		"config_file": "/etc/semaphore/runner.json",
 		"one_off": false,
 		"webhook": "",
 		"max_parallel_tasks": 20
 	},
 	"billing_enabled": false
 }

mhzawadi avatar Mar 26 '24 22:03 mhzawadi