nomad-driver-containerd
Running with Nomad inside containerd
I'm interested in supporting this driver within the ResinStack distribution that I have developed for a more readily deployable version of the nomad ecosystem. In this environment I have nomad itself running as a containerd task, and I'm trying to work out either what needs to be mounted in, or if I can change the mount paths. Right now I'm hung up on this error and would appreciate advice:
2022-01-27T22:10:45-06:00 Driver Failure rpc error: code = Unknown desc = Error in creating container: failed to mount /tmp/containerd-mount2802059906: no such file or directory
/tmp from the host is available to the container, so I'm not really sure what's wrong here.
@the-maldridge Do you have a job spec?
Ask and ye shall receive:
job "proxy" {
name = "proxy"
datacenters = ["minicluster-control"]
type = "system"
group "traefik" {
network {
mode = "host"
port "http" { static = 80 }
port "metrics" { static = 8080 }
dns {
servers = ["127.0.0.1"]
}
}
service {
port = "http"
check {
type = "http"
path = "/ping"
port = "metrics"
address_mode = "host"
interval = "15s"
timeout = "2s"
}
connect {
native = true
}
}
task "traefik" {
driver = "containerd-driver"
config {
image = "traefik:v2.5.2"
args = [
"--accesslog=true",
"--api.dashboard",
"--api.insecure=true",
"--entrypoints.http.address=:80",
"--entrypoints.traefik.address=:8080",
"--metrics.prometheus",
"--pilot.dashboard=false",
"--ping=true",
"--providers.file.filename=/local/dynamic.toml",
"--providers.consulcatalog.connectaware=true",
"--providers.consulcatalog.connectbydefault=true",
"--providers.consulcatalog.servicename=proxy-traefik",
"--providers.consulcatalog.defaultrule=Host(`{{normalize .Name}}.mc`)",
"--providers.consulcatalog.exposedbydefault=false",
"--providers.consulcatalog.endpoint.address=127.0.0.1:8500",
]
}
template {
data=<<EOF
[http]
[http.routers]
[http.routers.nomad]
entryPoints = ["http"]
service = "nomad"
rule = "Host(`nomad.mc`)"
[http.routers.consul]
entryPoints = ["http"]
service = "consul"
rule = "Host(`consul.mc`)"
[http.routers.vault]
entryPoints = ["http"]
service = "vault"
rule = "Host(`vault.mc`)"
[http.services]
[http.services.nomad]
[http.services.nomad.loadBalancer]
[[http.services.nomad.loadBalancer.servers]]
url = "http://nomad.service.consul:4646"
[http.services.consul]
[http.services.consul.loadBalancer]
[[http.services.consul.loadBalancer.servers]]
url = "http://consul.service.consul:8500"
[http.services.vault]
[http.services.vault.loadBalancer]
[[http.services.vault.loadBalancer.servers]]
url = "http://active.vault.service.consul:8200"
EOF
destination = "local/dynamic.toml"
}
resources {
cpu = 500
memory = 64
}
}
}
}
@the-maldridge Works fine for me!
root@vagrant:~/go/src/github.com/Roblox/nomad-driver-containerd/example# nomad status
ID Type Priority Status Submit Date
proxy system 50 running 2022-01-28T06:37:10Z
Logs from Nomad
Jan 28 06:37:10 vagrant nomad[4654]: 2022-01-28T06:37:10.344Z [INFO] client.alloc_runner.task_runner.task_hook.logmon.nomad: opening fifo: alloc_id=bfaa8eaa-c9d9-13a1-34bd-4e246171ee89 task=traefik path=/tmp/nomad/alloc/bfaa8eaa-c9d9-13a1-34bd-4e246171ee89/alloc/logs/.traefik.stdout.fifo @module=logmon timestamp=2022-01-28T06:37:10.344Z
Jan 28 06:37:10 vagrant nomad[4654]: 2022-01-28T06:37:10.344Z [INFO] client.alloc_runner.task_runner.task_hook.logmon.nomad: opening fifo: alloc_id=bfaa8eaa-c9d9-13a1-34bd-4e246171ee89 task=traefik path=/tmp/nomad/alloc/bfaa8eaa-c9d9-13a1-34bd-4e246171ee89/alloc/logs/.traefik.stderr.fifo @module=logmon timestamp=2022-01-28T06:37:10.344Z
Jan 28 06:37:10 vagrant nomad[4654]: 2022/01/28 06:37:10.349050 [INFO] (runner) creating new runner (dry: false, once: false)
Jan 28 06:37:10 vagrant nomad[4654]: 2022/01/28 06:37:10.349589 [INFO] (runner) creating watcher
Jan 28 06:37:10 vagrant nomad[4654]: 2022/01/28 06:37:10.349908 [INFO] (runner) starting
Jan 28 06:37:10 vagrant nomad[4654]: 2022/01/28 06:37:10.351123 [INFO] (runner) rendered "(dynamic)" => "/tmp/nomad/alloc/bfaa8eaa-c9d9-13a1-34bd-4e246171ee89/traefik/local/dynamic.toml"
Jan 28 06:37:10 vagrant nomad[4654]: 2022-01-28T06:37:10.355Z [INFO] client.driver_mgr.containerd-driver: starting task: driver=containerd-driver @module=containerd-driver driver_cfg="{Image:traefik:v2.5.2 Command: Args:[--accesslog=true --api.dashboard --api.insecure=true --entrypoints.http.address=:80 --entrypoints.traefik.address=:8080 --metrics.prometheus --pilot.dashboard=false --ping=true --providers.file.filename=/local/dynamic.toml --providers.consulcatalog.connectaware=true --providers.consulcatalog.connectbydefault=true --providers.consulcatalog.servicename=proxy-traefik --providers.consulcatalog.defaultrule=Host(`{{normalize .Name}}.mc`) --providers.consulcatalog.exposedbydefault=false --providers.consulcatalog.endpoint.address=127.0.0.1:8500] CapAdd:[] CapDrop:[] Cwd: Devices:[] Seccomp:false SeccompProfile: ShmSize: Sysctl:map[] Privileged:false PidsLimit:0 PidMode: Hostname: HostDNS:false ImagePullTimeout:5m ExtraHosts:[] Entrypoint:[] ReadOnlyRootfs:false HostNetwork:false Auth:{Username: Password:} Mounts:[{Type:bind Target:/etc/resolv.conf Source:/tmp/nomad/alloc/bfaa8eaa-c9d9-13a1-34bd-4e246171ee89/traefik/resolv.conf Options:[bind ro]}]}" timestamp=2022-01-28T06:37:10.354Z
Jan 28 06:37:35 vagrant nomad[4654]: 2022-01-28T06:37:35.052Z [INFO] client.driver_mgr.containerd-driver: Successfully pulled docker.io/library/traefik:v2.5.2 image
Jan 28 06:37:35 vagrant nomad[4654]: : driver=containerd-driver @module=containerd-driver timestamp=2022-01-28T06:37:35.052Z
Jan 28 06:37:35 vagrant nomad[4654]: 2022-01-28T06:37:35.284Z [INFO] client.driver_mgr.containerd-driver: Successfully created container with name: traefik-bfaa8eaa-c9d9-13a1-34bd-4e246171ee89
Jan 28 06:37:35 vagrant nomad[4654]: : driver=containerd-driver @module=containerd-driver timestamp=2022-01-28T06:37:35.284Z
Jan 28 06:37:35 vagrant nomad[4654]: 2022-01-28T06:37:35.524Z [INFO] client.driver_mgr.containerd-driver: Successfully created task with ID: traefik-bfaa8eaa-c9d9-13a1-34bd-4e246171ee89
Jan 28 06:37:35 vagrant nomad[4654]: : driver=containerd-driver @module=containerd-driver timestamp=2022-01-28T06:37:35.523Z
Nomad alloc logs
root@vagrant:~/go/src/github.com/Roblox/nomad-driver-containerd/example# nomad alloc logs -f bfaa8eaa
time="2022-01-28T06:37:40Z" level=info msg="Configuration loaded from flags."
Yes, I expect on an un-namespaced system it would. The key point of my question, though, is that nomad is itself running under containerd in an isolated mount namespace. I want to know what paths from the host I need to map for nomad to be able to use the containerd driver.
@the-maldridge I am not sure I completely follow your question. When you say "un-namespaced system", are you talking about Nomad namespaces or Linux namespaces? What do you mean by "nomad is itself running under containerd"? Are you trying to run Nomad-in-Nomad, like DIND? As in, you have a Nomad server which launches a container (c1) using containerd-driver, and you want to run Nomad inside that container, c1?
Nomad (s1) ---> containerd-driver ----> c1 [Nomad (s2)]
That's fair, this is a slightly unorthodox environment and I haven't really explained it that well. In my environment I am using linuxkit/linuxkit to build my machine images, and this means that the init and supervision system at the OS layer is containerd. This means that nomad is itself a task being started and managed by containerd with filesystem isolation. What I want to do is use the containerd-driver to have nomad interact with the host containerd in much the same way that binding the docker socket into a container allows that container to start additional docker containers on the host.
So to recap, what I have is:
init-shim --> containerd --> nomad
And what I want to do is be able to do this:
init-shim --> containerd --> nomad
                         \-> my-nomad-alloc
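For what it's worth, the socket analogy can be sanity-checked directly: once the host containerd socket is visible inside Nomad's mount namespace, plain ctr can talk to the host containerd from in there. A minimal check, assuming the default socket path /run/containerd/containerd.sock is bind-mounted in:

# From inside the Nomad container: confirm the host containerd is reachable
ctr --address /run/containerd/containerd.sock version

# List the namespaces the host containerd knows about (the driver creates
# its containers under a separate namespace, visible here once jobs run)
ctr --address /run/containerd/containerd.sock namespaces ls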
To do this with a dockerd that's running adjacent to nomad I bind the following paths to Nomad's mount namespace:
"/etc/nomad:/etc/nomad",
"/etc/resolv.cluster:/etc/resolv.conf",
"/lib/modules:/lib/modules",
"/run:/run:rshared",
"/service:/service",
"/usr/bin/runsv:/usr/bin/runsv",
"/var/persist:/var/persist:rshared",
"/var/run:/var/run:rshared",
The important paths for the docker driver are /run, /lib/modules, and /var/persist (the nomad data_dir). It looks like the containerd driver wants to make use of /tmp as well, and rather than playing whack-a-mole with paths I am hopeful there is a well-understood set of paths through which Nomad and containerd interact.
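To make the path question concrete: the driver hands host-side paths (the alloc/task directories under the Nomad data_dir, plus the rendered resolv.conf) to containerd, which then resolves them in its own mount namespace, so every such path has to resolve to the same thing on both sides. A rough check (paths illustrative), assuming the /var/persist data_dir above and that nsenter/pgrep are available on the host:

# The alloc dirs Nomad hands to containerd as bind-mount sources (host view)
ls /var/persist/alloc

# The same path as seen from Nomad's mount namespace; if these differ,
# containerd will be asked to mount paths that don't exist for it
nsenter --target "$(pgrep -xo nomad)" --mount ls /var/persist/alloc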
Hopefully that makes sense, but please don't hesitate to ask if there's more information I can provide.
@the-maldridge Why not run Nomad as a container in the host namespace? This way your Nomad (running as a containerd container) will have access to the host containerd (the init system's containerd), and can register the containerd-driver.
Hmm, my apologies as it seems I had not clearly communicated how this was configured.
Nomad is running as a containerd container already, and has access to the containerd on the host. However, like all containerd containers, it has default filesystem isolation, which means there are some directories that Nomad needs to share between its namespace and the host containerd's namespace so they can be bound in. Mostly this is the data directory that contains all the alloc subdirectories, but it seems the containerd driver also wants to use things in /tmp, which other drivers do not.
I can crawl the code if the answer to "what directories does the containerd driver need to use" is "we don't know" but I'd hoped for an easy answer to this problem.
I don't think containerd-driver uses anything in /tmp. The only host location containerd-driver needs, that I know of, is /etc, since it needs to set up /etc/hosts and /etc/resolv.conf when setting up the container. There is nothing in /tmp which the driver needs to set up a container.
2022-01-27T22:10:45-06:00 Driver Failure rpc error: code = Unknown desc = Error in creating container: failed to mount /tmp/containerd-mount2802059906: no such file or directory
The error you posted seems to be coming from containerd when the driver calls containerd to set up the container.
I think we need to figure out where containerd is looking for that /tmp/containerd-mount2802059906, as it doesn't seem to be the host /tmp. Most likely it's the container rootfs, which is mounted somewhere on the host.
Most container supervisors (e.g. containerd) set up a block device (and a filesystem on top of that) which is mounted somewhere on the host, and when your container process is started, your container's PID 1 is pivot_root'ed to that rootfs location instead of the host /. That's how they achieve filesystem isolation.
I would put a sleep in the driver when this error happens so that things don't get cleaned up right away, then look for that file /tmp/containerd-mount2802059906 and see what the actual host path for it is.
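To complement that, comparing what each mount namespace calls /tmp can show whether the path is being generated in a different tmpfs; a rough sketch, assuming nsenter/pidof/pgrep are available on the host:

# containerd's view: look for the transient containerd-mountXXXX temp dirs
nsenter --target "$(pidof containerd | awk '{print $1}')" --mount \
  ls -d /tmp/containerd-mount* 2>/dev/null

# Nomad's view of /tmp, for comparison; if these are two different tmpfs
# instances, a path created in one namespace won't exist in the other
nsenter --target "$(pgrep -xo nomad)" --mount ls /tmp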
@the-maldridge did you get this working? With host networking it works for me; I am sharing the directories below with the host. BUT when I try with "bridge" networking I get some weird errors.
Here is the linuxkit yml for nomad:
- name: nomad
  image: gitea.abc.dev/platform/linuxkit/nomad:1.7.6
  #command: ["/bin/nomad", "agent", "-dev", "-bind", "0.0.0.0", "-plugin-dir=/nomad/plugins", "-config=/etc/nomad"]
  command: ["/bin/nomad", "agent", "-dev", "-config=/etc/nomad"]
  capabilities:
    - all
  mounts:
    - type: cgroup
      options: ["rw","nosuid","noexec","nodev","relatime"]
  binds:
    - /var/lib/nomad:/var/lib/nomad
    # - /etc/nomad:/etc/nomad               # default config that is part of the container is good enough for now
    - /sys/fs/cgroup:/sys/fs/cgroup:rw      # without this nomad will fail on creating a directory here
    - /run/containerd:/run/containerd       # needed to connect to the containerd socket
    - /etc/resolv.conf:/etc/resolv.conf:ro  # by default qemu user networking does not pass a dns server and dns fails; just a workaround for qemu, may fix later
    - /etc/ssl:/etc/ssl                     # brings all CA certs into the container and thus fixes SSL cert errors while pulling images
    - /var/lib/containerd:/var/lib/containerd/
    - /var/run/netns:/var/run/netns:rw
  runtime:
    mkdir: ["/var/lib/nomad", "/var/run/netns"]
This is my nomad config; otherwise I run with -dev, i.e. /bin/nomad agent -dev -config=/etc/nomad:
data_dir = "/var/lib/nomad"

# data_dir is shared with the host, so better to move plugins to a different one
plugin_dir = "/nomad/plugins"

bind_addr = "0.0.0.0"

plugin "containerd-driver" {
  config {
    enabled            = false
    containerd_runtime = "io.containerd.runc.v2"
    stats_interval     = "5s"
  }
}
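As a quick sanity check that the plugin actually registered with the client in this setup, the node's driver status should list it; something like:

# Should show containerd-driver among the node's detected/healthy drivers
nomad node status -self -verbose | grep -i containerd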
Here is my jobspec. If you change the bridge network to host (or drop the network block), it works fine, and I can see it create the redis container in containerd, where redis runs.
BUT with the job spec below I get an error while starting the container.
job "redis" {
datacenters = ["dc1"]
group "redis-group" {
network {
mode = "bridge"
port "server" { to = 6379 }
}
task "redis-task" {
driver = "containerd-driver"
config {
image = "docker.io/library/redis:alpine"
}
resources {
cpu = 500
memory = 256
}
}
}
}
With no network block, or with host networking, it just works and I can see containerd create a new containerd namespace called "nomad" and run the containers within that namespace.
[user@my-node nomad-jobs]$ nomad status redis
ID = redis
Name = redis
Submit Date = 2024-03-15T07:56:11-05:00
Type = service
Priority = 50
Datacenters = dc1
Namespace = default
Node Pool = default
Status = running
Periodic = false
Parameterized = false
Summary
Task Group Queued Starting Running Failed Complete Lost Unknown
redis-group 0 0 1 0 0 0 0
Latest Deployment
ID = 4cb44c75
Status = successful
Description = Deployment completed successfully
Deployed
Task Group Desired Placed Healthy Unhealthy Progress Deadline
redis-group 1 1 1 0 2024-03-15T13:06:24Z
Allocations
ID Node ID Task Group Version Desired Status Created Modified
3ec7a0f1 0e159920 redis-group 0 run running 27s ago 14s ago
Containerd has the container running
(ns: getty) linuxkit-9295049371a1:~# ctr -n nomad c ls
CONTAINER IMAGE RUNTIME
redis-task-1697f5b0-0cd8-93c5-cf43-cd21ec62d366 docker.io/library/redis:alpine io.containerd.runc.v2
BUT when I use the above job spec with the "bridge" network, I can see Nomad talk to CNI, get the bridge, get the IP from it and so on, BUT when it tries to create the container it just errors out for some reason.
BTW, in the containerd case, Nomad and the containerd driver do most of the work themselves (talking to CNI, getting the IP, preparing the files for the container, etc.), and containerd just runs the result, which is different from docker.
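Given that split of responsibilities, one thing worth checking when only bridge mode fails is whether the network namespace path that CNI reported (the Sandbox path under /var/run/netns, see the CNI result in the logs below) is also visible from the host containerd's mount namespace, since runc is spawned from there and has to join that netns when it starts the process. A rough check, using the alloc ID from the logs below as an example:

# The sandbox path Nomad/CNI created for the alloc (Nomad's view)
ls -l /var/run/netns/211617fc-9bab-b136-8aaf-f9acf6a9cff8

# The same path from the host containerd's mount namespace
nsenter --target "$(pidof containerd | awk '{print $1}')" --mount \
  ls -l /var/run/netns/211617fc-9bab-b136-8aaf-f9acf6a9cff8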
Here is what nomad alloc status says; it shows the error: Error in creating task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: can't get final child's PID from pipe: EOF: unknown
$ nomad status 80f7061e
ID = 80f7061e-a00a-219e-2f50-2dab36c7e254
Eval ID = 1ecf18ba
Name = redis.redis-group[0]
Node ID = 78dcdf8f
Node Name = linuxkit-9295049371a1
Job ID = redis
Job Version = 0
Client Status = failed
Client Description = Failed tasks
Desired Status = run
Desired Description = <none>
Created = 32s ago
Modified = 29s ago
Deployment ID = b42524a4
Deployment Health = unhealthy
Reschedule Eligibility = 29s from now
Task "redis-task" is "dead"
Task Resources:
CPU Memory Disk Addresses
500 MHz 256 MiB 300 MiB
Task Events:
Started At = N/A
Finished At = 2024-03-15T16:21:05Z
Total Restarts = 0
Last Restart = N/A
Recent Events:
Time Type Description
2024-03-15T11:21:05-05:00 Not Restarting Error was unrecoverable
2024-03-15T11:21:05-05:00 Driver Failure rpc error: code = Unknown desc = Error in creating task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: can't get final child's PID from pipe: EOF: unknown
2024-03-15T11:21:04-05:00 Task Setup Building Task Directory
2024-03-15T11:21:04-05:00 Received Task received by client
Below are excerpts from the Nomad logs.
Nomad gets the request:
2024-03-16T00:41:11.187Z [DEBUG] worker: dequeued evaluation: worker_id=1db92c8c-e55b-c558-4109-5a0926993be7 eval_id=cbababed-1caa-0fbd-79f9-8888ef380839 type=service namespace=default job_id=redis node_id="" triggered_by=alloc-failure
2024-03-16T00:41:11.187Z [DEBUG] worker.service_sched: reconciled current state with desired state: eval_id=cbababed-1caa-0fbd-79f9-8888ef380839 job_id=redis namespace=default worker_id=1db92c8c-e55b-c558-4109-5a0926993be7
results=
| Total changes: (place 1) (destructive 0) (inplace 0) (stop 1) (disconnect 0) (reconnect 0)
| Desired Changes for "redis-group": (place 1) (inplace 0) (destructive 0) (stop 1) (migrate 0) (ignore 0) (canary 0)
It talked to CNI, got the IP and other details back from it, and prepared the directories, etc.
2024-03-16T00:41:11.358Z [DEBUG] client.alloc_runner.runner_hook: received result from CNI: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 result="{\"Interfaces\":{\"eth0\":{\"IPConfigs\":[{\"IP\":\"172.26.64.4\",\"Gateway\":\"172.26.64.1\"}],\"Mac\":\"be:8a:19:d6:fa:07\",\"Sandbox\":\"/var/run/netns/211617fc-9bab-b136-8aaf-f9acf6a9cff8\"},\"nomad\":{\"IPConfigs\":null,\"Mac\":\"f2:1b:e2:4c:16:c9\",\"Sandbox\":\"\"},\"veth3b307013\":{\"IPConfigs\":null,\"Mac\":\"de:ce:d1:7f:0c:1b\",\"Sandbox\":\"\"}},\"DNS\":[{}],\"Routes\":[{\"dst\":\"0.0.0.0/0\"}]}"
2024-03-16T00:41:11.359Z [DEBUG] client.alloc_runner.task_runner: lifecycle start condition has been met, proceeding: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task
2024-03-16T00:41:11.359Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task type="Task Setup" msg="Building Task Directory" failed=false
2024-03-16T00:41:11.359Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: starting plugin: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task path=/bin/nomad args=["/bin/nomad", "logmon"]
2024-03-16T00:41:11.360Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: plugin started: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task path=/bin/nomad pid=415
2024-03-16T00:41:11.360Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: waiting for RPC address: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task plugin=/bin/nomad
2024-03-16T00:41:11.408Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon.nomad: plugin address: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task @module=logmon address=/tmp/plugin1164661790 network=unix timestamp=2024-03-16T00:41:11.408Z
2024-03-16T00:41:11.409Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: using plugin: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task version=2
2024-03-16T00:41:11.414Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon.nomad: opening fifo: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task path=/var/lib/nomad/alloc/211617fc-9bab-b136-8aaf-f9acf6a9cff8/alloc/logs/.redis-task.stdout.fifo @module=logmon timestamp=2024-03-16T00:41:11.414Z
2024-03-16T00:41:11.414Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon.nomad: opening fifo: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task @module=logmon path=/var/lib/nomad/alloc/211617fc-9bab-b136-8aaf-f9acf6a9cff8/alloc/logs/.redis-task.stderr.fifo timestamp=2024-03-16T00:41:11.414Z
Now the driver starts the container:
2024-03-16T00:41:11.418Z [INFO] client.driver_mgr.containerd-driver: starting task: driver=containerd-driver @module=containerd-driver driver_cfg="{Image:docker.io/library/redis:alpine Command: Args:[] CapAdd:[] CapDrop:[] Cwd: Devices:[] Seccomp:false SeccompProfile: ShmSize: Sysctl:map[] Privileged:false PidsLimit:0 PidMode: Hostname: HostDNS:false ImagePullTimeout:5m ExtraHosts:[] Entrypoint:[] ReadOnlyRootfs:false HostNetwork:false Auth:{Username: Password:} Mounts:[{Type:bind Target:/etc/resolv.conf Source:/var/lib/nomad/alloc/211617fc-9bab-b136-8aaf-f9acf6a9cff8/redis-task/resolv.conf Options:[bind ro]}]}" timestamp=2024-03-16T00:41:11.417Z
2024-03-16T00:41:11.653Z [DEBUG] client: updated allocations: index=36 total=3 pulled=2 filtered=1
2024-03-16T00:41:11.654Z [DEBUG] client: allocation updates: added=0 removed=0 updated=2 ignored=1
2024-03-16T00:41:11.654Z [DEBUG] client: allocation updates applied: added=0 removed=0 updated=2 ignored=1 errors=0
2024-03-16T00:41:11.829Z [INFO] client.driver_mgr.containerd-driver: Successfully pulled docker.io/library/redis:alpine image
: driver=containerd-driver @module=containerd-driver timestamp=2024-03-16T00:41:11.829Z
2024-03-16T00:41:11.865Z [INFO] client.driver_mgr.containerd-driver: Successfully created container with name: redis-task-211617fc-9bab-b136-8aaf-f9acf6a9cff8
: driver=containerd-driver @module=containerd-driver timestamp=2024-03-16T00:41:11.864Z
BTW, in containerd, setting up the container (environment) and running the process inside that container are different steps. Creating the container environment succeeded, as seen above, BUT starting the task (the process inside the container) failed.
2024-03-16T00:41:12.013Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task type="Driver Failure" msg="rpc error: code = Unknown desc = Error in creating task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: can't get final child's PID from pipe: EOF: unknown" failed=false
2024-03-16T00:41:12.013Z [ERROR] client.alloc_runner.task_runner: running driver failed: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task error="rpc error: code = Unknown desc = Error in creating task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: can't get final child's PID from pipe: EOF: unknown"
2024-03-16T00:41:12.013Z [INFO] client.alloc_runner.task_runner: not restarting task: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task reason="Error was unrecoverable"
2024-03-16T00:41:12.013Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task type="Not Restarting" msg="Error was unrecoverable" failed=true
2024-03-16T00:41:12.112Z [DEBUG] http: request complete: method=GET path=/v1/agent/health?type=client duration="175.001µs"
2024-03-16T00:41:13.593Z [DEBUG] http: request complete: method=GET path=/v1/agent/health?type=server duration=3.957282ms
2024-03-16T00:41:14.016Z [WARN] client.alloc_runner.task_runner.task_hook.logmon.nomad: failed to read from log fifo: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task @module=logmon error="read /var/lib/nomad/alloc/211617fc-9bab-b136-8aaf-f9acf6a9cff8/alloc/logs/.redis-task.stderr.fifo: file already closed" timestamp=2024-03-16T00:41:14.015Z
2024-03-16T00:41:14.016Z [WARN] client.alloc_runner.task_runner.task_hook.logmon.nomad: failed to read from log fifo: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task @module=logmon error="read /var/lib/nomad/alloc/211617fc-9bab-b136-8aaf-f9acf6a9cff8/alloc/logs/.redis-task.stdout.fifo: file already closed" timestamp=2024-03-16T00:41:14.015Z
2024-03-16T00:41:14.020Z [ERROR] client.alloc_runner.task_runner.task_hook.logmon: error encountered while scanning stdout: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task error="read |0: file already closed"
2024-03-16T00:41:14.020Z [INFO] client.alloc_runner.task_runner.task_hook.logmon: plugin process exited: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task plugin=/bin/nomad id=415
2024-03-16T00:41:14.020Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: plugin exited: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task
2024-03-16T00:41:14.021Z [DEBUG] client.alloc_runner.task_runner: task run loop exiting: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task
2024-03-16T00:41:14.021Z [DEBUG] client.alloc_runner.task_runner.task_hook.logmon.stdio: received EOF, stopping recv loop: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8 task=redis-task err="rpc error: code = Canceled desc = context canceled"
2024-03-16T00:41:14.022Z [INFO] client.gc: marking allocation for GC: alloc_id=211617fc-9bab-b136-8aaf-f9acf6a9cff8
2024-03-16T00:41:14.111Z [DEBUG] nomad.client: adding evaluations for rescheduling failed allocations: num_evals=1
2024-03-16T00:41:14.115Z [DEBUG] http: request complete: method=GET path="/v1/deployment/f6242d4e-3005-a367-e46d-9c0b9db297e2?index=34&stale=" duration=2.921838928s
2024-03-16T00:41:14.116Z [DEBUG] worker: dequeued evaluation: worker_id=1db92c8c-e55b-c558-4109-5a0926993be7 eval_id=f5406140-3663-7b52-a878-9acadd78f84d type=service namespace=default job_id=redis node_id="" triggered_by=alloc-failure
At the same time, I am NOT getting a lot of useful information from containerd on why it is failing.
BTW, this is from the console, so some of the logs are mixed together.
Here is what is seen when CNI creates things:
[ 169.129212] nomad: port 1(veth285e1ff4) entered blocking state
[ 169.130936] nomad: port 1(veth285e1ff4) entered disabled state
[ 169.136774] device veth285e1ff4 entered promiscuous mode
[ 169.147776] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[ 169.149754] IPv6: ADDRCONF(NETDEV_CHANGE): veth285e1ff4: link becomes ready
[ 169.152408] nomad: port 1(veth285e1ff4) entered blocking state
[ 169.154518] nomad: port 1(veth285e1ff4) entered forwarding state
Now containerd got the request to start the container:
DEBU[2024-03-16T00:39:37.221298797Z] (*service).Write started expected="sha256:1b503bb77079ba644371969e06e1a6a1670bb34c2251107c0fc3a21ef9fdaeca" ref="index-sha256:1b503=
DEBU[2024-03-16T00:39:37.328364216Z] (*service).Write started expected="sha256:c1ac6782927e574394225a790b6eb476154d1a16681b1374c62625d9bc324b18" ref="manifest-sha256:c1=
DEBU[2024-03-16T00:39:37.673977279Z] (*service).Write started expected="sha256:287766fc4fcfb5a477bf837560f595de61e82359f79381b13ad83581646ddb42" ref="config-sha256:2877=
DEBU[2024-03-16T00:39:37.909513172Z] stat snapshot key="sha256:d4fc045c9e3a848011de66f34b81f052d4f2c15a17bb196d637e526349601820"
DEBU[2024-03-16T00:39:37.915924017Z] prepare snapshot key="extract-915107854-TFUC sha256:d4fc045c9e3a848011de66f34b81f052d4f2c15a17bb196d637e526349601820" paren=
DEBU[2024-03-16T00:39:37.924000400Z] event published ns=nomad topic=/snapshot/prepare type=containerd.events.SnapshotPrepare
DEBU[2024-03-16T00:39:37.931596920Z] (*service).Write started expected="sha256:4f4fb700ef54461cfa02571ae0db9a0dc1e0cdb5577484a6d75e68dc3
......
(most of these logs are not much help for debugging this)
time="2024-03-16T00:39:40.181017627Z" level=info msg="loading plugin \"io.containerd.event.v1.publisher\"..." runtime=io.containerd.runc.v2 type=io.containerd.event.v1
time="2024-03-16T00:39:40.181188895Z" level=info msg="loading plugin \"io.containerd.internal.v1.shutdown\"..." runtime=io.containerd.runc.v2 type=io.containerd.internal.v1
time="2024-03-16T00:39:40.181202168Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.task\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
time="2024-03-16T00:39:40.181339967Z" level=debug msg="registering ttrpc service" id=io.containerd.ttrpc.v1.task
time="2024-03-16T00:39:40.181355435Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.pause\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
time="2024-03-16T00:39:40.181372659Z" level=debug msg="registering ttrpc service" id=io.containerd.ttrpc.v1.pause
time="2024-03-16T00:39:40.184955521Z" level=debug msg="serving api on socket" socket="[inherited from parent]"
Now here is what we see next - if you look below, it jumps directly to "failed to delete task". Where are the logs for attempting to start the task? Maybe it's not reaching containerd somehow; I need to see what's happening in the containerd driver.
DEBU[2024-03-16T00:40:11.137507266Z] failed to delete task error="rpc error: code = NotFound desc = container not created: not found" id=redis-task-9b7f110d-be0c-6cd8
INFO[2024-03-16T00:40:11.144113795Z] shim disconnected id=redis-task-9b7f110d-be0c-6cd8-511a-a05bb24a1a1d namespace=nomad
WARN[2024-03-16T00:40:11.147785055Z] cleaning up after shim disconnected id=redis-task-9b7f110d-be0c-6cd8-511a-a05bb24a1a1d namespace=nomad
INFO[2024-03-16T00:40:11.151670911Z] cleaning up dead shim namespace=nomad
WARN[2024-03-16T00:40:11.170792818Z] cleanup warnings time="2024-03-16T00:40:11Z" level=debug msg="starting signal loop" namespace=nomad pid=1138 runtime=io.containerd.runc.v2
time="2024-03-16T00:40:11Z" level=warning msg="failed to read init pid file" error="open /run/containerd/io.containerd.runtime.v2.task/nomad/redis-task-9b7f110d-be0c-6cd8-511a-a05bb24a1a1d/=
ERRO[2024-03-16T00:40:11.180667544Z] copy shim log error="read /proc/self/fd/12: file already closed" namespace=nomad
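If the container hasn't been garbage-collected yet, the spec the driver generated can also be inspected from the host side, which shows exactly which mounts and namespace paths runc was asked to use; a minimal sketch using the nomad namespace and the container ID from the logs above:

# What the driver created in the nomad containerd namespace
ctr -n nomad containers ls
ctr -n nomad tasks ls

# Dump the container's stored OCI spec (mounts, namespace paths, rootfs)
ctr -n nomad containers info redis-task-9b7f110d-be0c-6cd8-511a-a05bb24a1a1d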