Docker Swarm's DNS server on an overlay network resolves container hostnames to incorrect IP addresses
- [x] This is a bug report
- [ ] This is a feature request
- [x] I searched existing issues before opening this one
Expected behavior
When deploying containers via docker stack deploy using a Docker Compose file that sets a hostname for each container, each container should be able to resolve the other containers by hostname.
Actual behavior
Containers are getting valid ipv4 addresses and can ping one another, but hostnames are resolving to different IP addresses on the same subnet.
Steps to reproduce the behavior
Create four LXD containers running the latest version of Docker from the official Docker repository for Ubuntu 18.04. Create a swarm.
Deploy containers to the swarm with the following docker-compose.yml file via docker stack deploy -c docker-compose.yml dgraph:
version: "3"
networks:
dgraph:
services:
zero:
image: dgraph/dgraph:latest
hostname: "zero"
volumes:
- data-volume:/dgraph
ports:
- "5080:5080"
- "6080:6080"
networks:
- dgraph
deploy:
placement:
constraints:
- node.hostname == dg-zero
command: dgraph zero --my=zero:5080 --replicas 3 --bindall=true
alpha1:
image: dgraph/dgraph:latest
hostname: "alpha1"
volumes:
- data-volume:/dgraph
ports:
- "8080:8080"
- "9080:9080"
networks:
- dgraph
deploy:
placement:
constraints:
- node.hostname == dg-alpha1
command: dgraph alpha --my=alpha1:7080 --lru_mb=1024 --zero=zero:5080 --bindall=true
alpha2:
image: dgraph/dgraph:latest
hostname: "alpha2"
volumes:
- data-volume:/dgraph
ports:
- "8081:8081"
- "9081:9081"
networks:
- dgraph
deploy:
placement:
constraints:
- node.hostname == dg-alpha2
command: dgraph alpha --my=alpha2:7081 --lru_mb=1024 --zero=zero:5080 -o 1 --bindall=true
alpha3:
image: dgraph/dgraph:latest
hostname: "alpha3"
volumes:
- data-volume:/dgraph
ports:
- "8082:8082"
- "9082:9082"
networks:
- dgraph
deploy:
placement:
constraints:
- node.hostname == dg-alpha3
command: dgraph alpha --my=alpha3:7082 --lru_mb=1024 --zero=zero:5080 -o 2 --bindall=true
ratel:
image: dgraph/dgraph:latest
hostname: "ratel"
ports:
- "8000:8000"
networks:
- dgraph
command: dgraph-ratel
volumes:
data-volume:
On the master node holding the zero container, run docker container inspect on the zero container and note the IP address. In this case:
# docker container inspect dgraph_zero.1.3y2tug5z41n5igzpawf9gyyqe
[
{
"Id": "8b48711ab0cd485fdde4e89f56a1251162afdc22836614d0263a42a7eab86618",
"Created": "2020-01-14T01:14:29.011528658Z",
"Path": "dgraph",
"Args": [
"zero",
"--my=zero:5080",
"--replicas",
"3",
"--bindall=true"
],
"State": {
"Status": "running",
"Running": true,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 519,
"ExitCode": 0,
"Error": "",
"StartedAt": "2020-01-14T01:14:59.650768268Z",
"FinishedAt": "0001-01-01T00:00:00Z"
},
"Image": "sha256:18b436d209e55597814d311befe9d17fc1730eb5cf57e393fbbabc98aea9d037",
"ResolvConfPath": "/var/lib/docker/containers/8b48711ab0cd485fdde4e89f56a1251162afdc22836614d0263a42a7eab86618/resolv.conf",
"HostnamePath": "/var/lib/docker/containers/8b48711ab0cd485fdde4e89f56a1251162afdc22836614d0263a42a7eab86618/hostname",
"HostsPath": "/var/lib/docker/containers/8b48711ab0cd485fdde4e89f56a1251162afdc22836614d0263a42a7eab86618/hosts",
"LogPath": "/var/lib/docker/containers/8b48711ab0cd485fdde4e89f56a1251162afdc22836614d0263a42a7eab86618/8b48711ab0cd485fdde4e89f56a1251162afdc22836614d0263a42a7eab86618-json.log",
"Name": "/dgraph_zero.1.3y2tug5z41n5igzpawf9gyyqe",
"RestartCount": 0,
"Driver": "btrfs",
"Platform": "linux",
"MountLabel": "",
"ProcessLabel": "",
"AppArmorProfile": "docker-default",
"ExecIDs": null,
"HostConfig": {
"Binds": null,
"ContainerIDFile": "",
"LogConfig": {
"Type": "json-file",
"Config": {}
},
"NetworkMode": "default",
"PortBindings": {},
"RestartPolicy": {
"Name": "",
"MaximumRetryCount": 0
},
"AutoRemove": false,
"VolumeDriver": "",
"VolumesFrom": null,
"CapAdd": null,
"CapDrop": null,
"Capabilities": null,
"Dns": null,
"DnsOptions": null,
"DnsSearch": null,
"ExtraHosts": null,
"GroupAdd": null,
"IpcMode": "private",
"Cgroup": "",
"Links": null,
"OomScoreAdj": 0,
"PidMode": "",
"Privileged": false,
"PublishAllPorts": false,
"ReadonlyRootfs": false,
"SecurityOpt": null,
"UTSMode": "",
"UsernsMode": "",
"ShmSize": 67108864,
"Runtime": "runc",
"ConsoleSize": [
0,
0
],
"Isolation": "default",
"CpuShares": 0,
"Memory": 0,
"NanoCpus": 0,
"CgroupParent": "",
"BlkioWeight": 0,
"BlkioWeightDevice": null,
"BlkioDeviceReadBps": null,
"BlkioDeviceWriteBps": null,
"BlkioDeviceReadIOps": null,
"BlkioDeviceWriteIOps": null,
"CpuPeriod": 0,
"CpuQuota": 0,
"CpuRealtimePeriod": 0,
"CpuRealtimeRuntime": 0,
"CpusetCpus": "",
"CpusetMems": "",
"Devices": null,
"DeviceCgroupRules": null,
"DeviceRequests": null,
"KernelMemory": 0,
"KernelMemoryTCP": 0,
"MemoryReservation": 0,
"MemorySwap": 0,
"MemorySwappiness": null,
"OomKillDisable": false,
"PidsLimit": null,
"Ulimits": null,
"CpuCount": 0,
"CpuPercent": 0,
"IOMaximumIOps": 0,
"IOMaximumBandwidth": 0,
"Mounts": [
{
"Type": "volume",
"Source": "dgraph_data-volume",
"Target": "/dgraph",
"VolumeOptions": {
"Labels": {
"com.docker.stack.namespace": "dgraph"
}
}
}
],
"MaskedPaths": [
"/proc/asound",
"/proc/acpi",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/proc/scsi",
"/sys/firmware"
],
"ReadonlyPaths": [
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
]
},
"GraphDriver": {
"Data": null,
"Name": "btrfs"
},
"Mounts": [
{
"Type": "volume",
"Name": "dgraph_data-volume",
"Source": "/var/lib/docker/volumes/dgraph_data-volume/_data",
"Destination": "/dgraph",
"Driver": "local",
"Mode": "z",
"RW": true,
"Propagation": ""
}
],
"Config": {
"Hostname": "zero",
"Domainname": "",
"User": "",
"AttachStdin": false,
"AttachStdout": false,
"AttachStderr": false,
"ExposedPorts": {
"8080/tcp": {},
"9080/tcp": {}
},
"Tty": false,
"OpenStdin": false,
"StdinOnce": false,
"Env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
],
"Cmd": [
"dgraph",
"zero",
"--my=zero:5080",
"--replicas",
"3",
"--bindall=true"
],
"Image": "dgraph/dgraph:latest@sha256:94928027e5e299e836129b9f9e66e34628ec3bd1e05fee1ce368ea0d0cc4e152",
"Volumes": null,
"WorkingDir": "/dgraph",
"Entrypoint": null,
"OnBuild": null,
"Labels": {
"com.docker.stack.namespace": "dgraph",
"com.docker.swarm.node.id": "m5u3d5cx69hxmvf04tut8vubq",
"com.docker.swarm.service.id": "uvt9alocatqm0lzhtrm1l6hpp",
"com.docker.swarm.service.name": "dgraph_zero",
"com.docker.swarm.task": "",
"com.docker.swarm.task.id": "3y2tug5z41n5igzpawf9gyyqe",
"com.docker.swarm.task.name": "dgraph_zero.1.3y2tug5z41n5igzpawf9gyyqe"
}
},
"NetworkSettings": {
"Bridge": "",
"SandboxID": "4548013a15833d086b281a8c2dd61ced6ea5c92f815a305f7337effe9b04a13a",
"HairpinMode": false,
"LinkLocalIPv6Address": "",
"LinkLocalIPv6PrefixLen": 0,
"Ports": {
"8080/tcp": null,
"9080/tcp": null
},
"SandboxKey": "/var/run/docker/netns/4548013a1583",
"SecondaryIPAddresses": null,
"SecondaryIPv6Addresses": null,
"EndpointID": "",
"Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"MacAddress": "",
"Networks": {
"dgraph_dgraph": {
"IPAMConfig": {
"IPv4Address": "10.0.9.3"
},
"Links": null,
"Aliases": [
"8b48711ab0cd"
],
"NetworkID": "lve3kr9vm42rwu1nci897zey7",
"EndpointID": "056ae62475da805ec212d9ec2b2e4a5c9e09e2405c15ad6e8b298e90669b512d",
"Gateway": "",
"IPAddress": "10.0.9.3",
"IPPrefixLen": 24,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"MacAddress": "02:42:0a:00:09:03",
"DriverOpts": null
},
"ingress": {
"IPAMConfig": {
"IPv4Address": "10.0.0.157"
},
"Links": null,
"Aliases": [
"8b48711ab0cd"
],
"NetworkID": "vjhpbsc1766lbvtu169fmh81l",
"EndpointID": "29bbc4de97e98b2e05a46dd42020dd1fbb75ff07d8c08a00b8ba6f2f4e00ec2a",
"Gateway": "",
"IPAddress": "10.0.0.157",
"IPPrefixLen": 24,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"MacAddress": "02:42:0a:00:00:9d",
"DriverOpts": null
}
}
}
}
]
The IP address on the overlay network is 10.0.9.3
Now move to an LXD node running a worker container and run docker exec -it dgraph_alpha1 /bin/ping zero where dgraph_alpha1 is the name of your worker container and zero is the hostname of your manager container.
# docker exec -it dgraph_alpha2.1.s215ookzk3r3uwqg7e3a3dyf5 /bin/ping zero
PING zero (10.0.9.2) 56(84) bytes of data.
From alpha2 (10.0.9.19) icmp_seq=1 Destination Host Unreachable
You can see the hostname zero is resolving to 10.0.9.2 instead of the correct 10.0.9.3 address. In my tests the last octet of the resolved IP address is always one lower than that of the actual IP. All DNS entries are consistently off by a seemingly reproducible number. When directly pinging another container's actual IP address, the connection is successful.
Same result using dig:
root@dg-alpha1 ~# docker exec -it dgraph_alpha1.1.0znc2x5fzobvsicjzxs8eri4b /bin/ping zero
PING zero (10.0.10.5) 56(84) bytes of data.
From alpha1 (10.0.10.8) icmp_seq=1 Destination Host Unreachable
From alpha1 (10.0.10.8) icmp_seq=2 Destination Host Unreachable
From alpha1 (10.0.10.8) icmp_seq=3 Destination Host Unreachable
From alpha1 (10.0.10.8) icmp_seq=4 Destination Host Unreachable
root@dg-zero ~# docker exec -it dgraph_zero.1.qyo6sul9ud1fg0kkq2evpsnnr /usr/bin/dig alpha1
; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> alpha1
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 1735
;; flags: qr rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 0
;; QUESTION SECTION:
;alpha1. IN A
;; ANSWER SECTION:
alpha1. 600 IN A 10.0.10.7
;; Query time: 0 msec
;; SERVER: 127.0.0.11#53(127.0.0.11)
;; WHEN: Wed Jan 15 18:30:49 UTC 2020
;; MSG SIZE rcvd: 46
Output of docker version:
Client: Docker Engine - Community
Version: 19.03.5
API version: 1.40
Go version: go1.12.12
Git commit: 633a0ea838
Built: Wed Nov 13 07:29:52 2019
OS/Arch: linux/amd64
Experimental: false
Server: Docker Engine - Community
Engine:
Version: 19.03.5
API version: 1.40 (minimum version 1.12)
Go version: go1.12.12
Git commit: 633a0ea838
Built: Wed Nov 13 07:28:22 2019
OS/Arch: linux/amd64
Experimental: false
containerd:
Version: 1.2.10
GitCommit: b34a5c8af56e510852c35414db4c1f4fa6172339
runc:
Version: 1.0.0-rc8+dev
GitCommit: 3e425f80a8c931f88e6d94a8c831b9d5aa481657
docker-init:
Version: 0.18.0
GitCommit: fec3683
Output of docker info:
Client:
Debug Mode: false
Server:
Containers: 34
Running: 2
Paused: 0
Stopped: 32
Images: 1
Server Version: 19.03.5
Storage Driver: btrfs
Build Version: Btrfs v4.15.1
Library Version: 102
Logging Driver: json-file
Cgroup Driver: cgroupfs
Plugins:
Volume: local
Network: bridge host ipvlan macvlan null overlay
Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog
Swarm: active
NodeID: m5u3d5cx69hxmvf04tut8vubq
Is Manager: true
ClusterID: jpz7lve7hotqagxklw54i3c21
Managers: 1
Nodes: 4
Default Address Pool: 10.0.0.0/8
SubnetSize: 24
Data Path Port: 4789
Orchestration:
Task History Retention Limit: 5
Raft:
Snapshot Interval: 10000
Number of Old Snapshots to Retain: 0
Heartbeat Tick: 1
Election Tick: 10
Dispatcher:
Heartbeat Period: 5 seconds
CA Configuration:
Expiry Duration: 3 months
Force Rotate: 0
Autolock Managers: false
Root Rotation In Progress: false
Node Address: 10.23.237.135
Manager Addresses:
10.23.237.135:2377
Runtimes: runc
Default Runtime: runc
Init Binary: docker-init
containerd version: b34a5c8af56e510852c35414db4c1f4fa6172339
runc version: 3e425f80a8c931f88e6d94a8c831b9d5aa481657
init version: fec3683
Security Options:
apparmor
seccomp
Profile: default
Kernel Version: 4.15.0-74-generic
Operating System: Ubuntu 18.04.3 LTS
OSType: linux
Architecture: x86_64
CPUs: 4
Total Memory: 14.33GiB
Name: dg-zero
ID: RUHN:ZPB2:PRMQ:6GJJ:E6LF:T2B6:WDDO:P3HN:MTK5:P6UR:GGHY:DRXK
Docker Root Dir: /var/lib/docker
Debug Mode: false
Registry: https://index.docker.io/v1/
Labels:
Experimental: false
Insecure Registries:
127.0.0.0/8
Live Restore Enabled: false
WARNING: No swap limit support
Additional environment details (AWS, VirtualBox, physical, etc.)
Test environment topology is as follows:
- Host: KDE Neon workstation
- LXD Container: zero
- Docker Node: dg-zero
- Docker Container: dgraph_zero
- Docker Container: dgraph_ratel
- Docker Node: dg-zero
- LXD Container: alpha1
- Docker Node: dg-alpha1
- Docker Container: dgraph_alpha1
- Docker Node: dg-alpha1
- LXD Container: alpha2
- Docker Node: dg-alpha2
- Docker Container: dgraph_alpha2
- Docker Node: dg-alpha2
- LXD Container: alpha3
- Docker Node: dg-alpha3
- Docker Container: dgraph_alpha3
- Docker Node: dg-alpha3
- LXD Container: zero
root@dg-zero ~# docker network inspect dgraph_dgraph
[
{
"Name": "dgraph_dgraph",
"Id": "gqj44vbxlc0tmqnewzibr5o1w",
"Created": "2020-01-15T18:21:43.493137495Z",
"Scope": "swarm",
"Driver": "overlay",
"EnableIPv6": false,
"IPAM": {
"Driver": "default",
"Options": null,
"Config": [
{
"Subnet": "10.0.10.0/24",
"Gateway": "10.0.10.1"
}
]
},
"Internal": false,
"Attachable": false,
"Ingress": false,
"ConfigFrom": {
"Network": ""
},
"ConfigOnly": false,
"Containers": {
"25e052e62e8aed37e51bd00ddad612501f85f410a00cf45e447ad2f36b492207": {
"Name": "dgraph_ratel.1.pm0o9wcyd9yycb8km0d3wtebq",
"EndpointID": "5afb0292abff9f8cb9a5cf4bc81cc621916f063e16ce6f5ede927b50781592ab",
"MacAddress": "02:42:0a:00:0a:03",
"IPv4Address": "10.0.10.3/24",
"IPv6Address": ""
},
"b82495c5977d4639532fbda1d9d874f093aa8e400ff16f08ec610de8ae1452b4": {
"Name": "dgraph_zero.1.qyo6sul9ud1fg0kkq2evpsnnr",
"EndpointID": "be4fee0812619c7bcf0d7b3e761d910b96405df6ee1dda3f44f8b3b976030f97",
"MacAddress": "02:42:0a:00:0a:06",
"IPv4Address": "10.0.10.6/24",
"IPv6Address": ""
},
"lb-dgraph_dgraph": {
"Name": "dgraph_dgraph-endpoint",
"EndpointID": "1b8adcdb540f9fbffafce80d0a9765daaaec770dee36b3d86d5eadd2338eac1a",
"MacAddress": "02:42:0a:00:0a:04",
"IPv4Address": "10.0.10.4/24",
"IPv6Address": ""
}
},
"Options": {
"com.docker.network.driver.overlay.vxlanid_list": "4106"
},
"Labels": {
"com.docker.stack.namespace": "dgraph"
},
"Peers": [
{
"Name": "4be3e8ce2a93",
"IP": "10.23.237.135"
},
{
"Name": "021a7324b83c",
"IP": "10.23.237.71"
},
{
"Name": "187ed43c46dc",
"IP": "10.23.237.144"
},
{
"Name": "b057d160698a",
"IP": "10.23.237.155"
}
]
}
]
UFW state is inactive. No iptables rules are present other than the defaults created by Docker. The only running firewall on the network is on the WAN and all containers are within the LAN. LXD containers can ping one another and they are all present as peers via docker inspect.
Try to ping the service name, i.e., dgraph_zero, instead of the hostname. That should resolve to the stable virtual IP of the service (VIP).
I'm having the same issue when using multiple networks for a service. My stack file is as follows:
version: "3"
services:
nginx:
image: ${registry}/siteipc_nginx:develop-dev
volumes:
- nfs_codebase:/var/www/html/
- nginx_logs:/var/log/nginx/
- nginx_confs:/etc/nginx/conf.d/
networks:
- public
deploy:
placement:
constraints: [node.labels.mylabel == true]
solr:
image: ${registry}/siteipc_solr:develop-dev
environment:
SOLR_DEFAULT_CONFIG_SET: search_api_solr_4.1.6
SOLR_HEAP: 512m
networks:
- private
- public
volumes:
- solr_index:/opt/solr/server/solr
deploy:
placement:
constraints: [node.labels.otherlabel == true]
resources:
limits:
cpus: '1.0'
memory: 1400M
reservations:
cpus: '0.1'
memory: 700M
networks:
private:
public:
external:
name: "nginx_prod_public"
The solr service cannot be resolved to the right IP address if it is in multiple networks, but if I remove one of them, then it starts to work properly.
I tried to ping the service name as @neben suggested, but it did not help. The only workaround was removing one of the networks.
Any ideas?
Did this ever get resolved? Seeing the same behaviour.
Did anyone manage to figure it out? Cheers!