Inconsistency in glustershd process
Observed behavior
- Self-heal daemon is running even when there are no volumes, or when the volume is in the stopped state (a suggested check is sketched after the output below)
[root@dhcp35-30 ~]# glustercli volume list
No volumes found
[root@dhcp35-30 ~]# ps aux | grep gluster
root 19068 0.0 1.1 1159428 20712 ? Ssl Jan03 1:30 /usr/sbin/glusterfs -s localhost --volfile-server-port 24007 --volfile-id gluster/glustershd -p /var/run/glusterd2/glustershd.pid -l /var/log/glusterd2/glusterfs/glustershd.log -S /var/run/glusterd2/shd-15f79c88ec2bcf27.socket --xlator-option *replicate*.node-uuid=2115479a-c493-4b44-9119-aa78b0dfcd5e
root 25752 0.8 1.0 502444 20024 ? Ssl 12:24 0:06 /usr/sbin/glusterd2 --config=/etc/glusterd2/glusterd2.toml
root 25779 0.0 0.0 112708 976 pts/0 S+ 12:37 0:00 grep --color=auto gluster
[root@dhcp35-30 ~]# glustercli volume stop rj
Volume rj stopped successfully
[root@dhcp35-30 ~]# ps aux | grep gluster
root 19068 0.0 1.1 1159428 20712 ? Ssl Jan03 1:42 /usr/sbin/glusterfs -s localhost --volfile-server-port 24007 --volfile-id gluster/glustershd -p /var/run/glusterd2/glustershd.pid -l /var/log/glusterd2/glusterfs/glustershd.log -S /var/run/glusterd2/shd-15f79c88ec2bcf27.socket --xlator-option *replicate*.node-uuid=2115479a-c493-4b44-9119-aa78b0dfcd5e
root 25752 0.8 1.1 502444 21968 ? Ssl 12:24 0:13 /usr/sbin/glusterd2 --config=/etc/glusterd2/glusterd2.toml
root 25883 0.0 0.0 112708 980 pts/0 S+ 12:49 0:00 grep --color=auto gluster
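A quick way to confirm the stale daemon is to compare the pid recorded in glusterd2's shd pid file against the running process; this is only a suggested check, using the pid-file path visible in the glustershd command line above:
# Suggested check; pid-file path taken from the glustershd command line above
cat /var/run/glusterd2/glustershd.pid
ps -p "$(cat /var/run/glusterd2/glustershd.pid)" -o pid,args   # still alive despite no volumes
glustercli volume list                                          # expected: No volumes found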
- Self-heal daemon is not running on one peer even after explicitly triggering heal (a new volume was created and started; see the suggested check after the curl output below)
[root@dhcp35-30 ~]# glustercli volume create rj replica 3 10.70.35.30:/bricks/brick1/rj0 10.70.35.106:/bricks/brick1/rj1 10.70.35.240:/bricks/brick1/rj2 10.70.35.30:/bricks/brick1/rj3 10.70.35.106:/bricks/brick1/rj4 10.70.35.240:/bricks/brick1/rj5 --create-brick-dir
rj Volume created successfully
Volume ID: b315490e-49de-4edf-afd1-bc3ba56c63f6
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]# glustercli volume start rj
Volume rj started successfully
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]# curl -X POST http://localhost:24007/v1/volumes/rj/heal
{"errors":[{"code":2,"message":"a txn step failed","fields":{"error":"dial unix /var/run/glusterd2/shd-1094f5f3ada7ccb2.socket: connect: no such file or directory","peer-id":"ca3f8907-4db7-4cff-aaf7-41f916ea91a1","step":"selfheal.Heal"}}]}
All the tests performed and their corresponding command outputs are listed below; a suggested follow-up appears after the last output.
[root@dhcp35-30 ~]# glustercli volume info
Volume Name: rj
Type: Distributed-Replicate
Volume ID: b315490e-49de-4edf-afd1-bc3ba56c63f6
State: Started
Transport-type: tcp
Options:
performance/quick-read.quick-read: off
performance/read-ahead.read-ahead: off
performance/readdir-ahead.readdir-ahead: off
performance/write-behind.write-behind: off
cluster/replicate.self-heal-daemon: on
performance/io-cache.io-cache: off
performance/md-cache.md-cache: off
performance/open-behind.open-behind: off
Number of Bricks: 2 x 3 = 6
Brick1: 10.70.35.30:/bricks/brick1/rj0
Brick2: 10.70.35.106:/bricks/brick1/rj1
Brick3: 10.70.35.240:/bricks/brick1/rj2
Brick4: 10.70.35.30:/bricks/brick1/rj3
Brick5: 10.70.35.106:/bricks/brick1/rj4
Brick6: 10.70.35.240:/bricks/brick1/rj5
[root@dhcp35-30 ~]# glustercli volume stop rj
Volume rj stopped successfully
[root@dhcp35-30 ~]# ps aux | grep gluster
root 19068 0.0 1.1 1159428 20712 ? Ssl Jan03 1:42 /usr/sbin/glusterfs -s localhost --volfile-server-port 24007 --volfile-id gluster/glustershd -p /var/run/glusterd2/glustershd.pid -l /var/log/glusterd2/glusterfs/glustershd.log -S /var/run/glusterd2/shd-15f79c88ec2bcf27.socket --xlator-option *replicate*.node-uuid=2115479a-c493-4b44-9119-aa78b0dfcd5e
root 25752 0.8 1.1 502444 21968 ? Ssl 12:24 0:13 /usr/sbin/glusterd2 --config=/etc/glusterd2/glusterd2.toml
root 25883 0.0 0.0 112708 980 pts/0 S+ 12:49 0:00 grep --color=auto gluster
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]# kill -9 19068
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]# glustercli volume create testvol1 replica 3 10.70.35.30:/bricks/brick1/rj6 10.70.35.106:/bricks/brick1/rj7 10.70.35.240:/bricks/brick1/rj8 10.70.35.30:/bricks/brick1/rj9 10.70.35.106:/bricks/brick1/rj10 10.70.35.240:/bricks/brick1/rj11 --create-brick-dir
testvol1 Volume created successfully
Volume ID: b13e81a1-df35-4af2-b00f-9dd9b0c0ffd2
[root@dhcp35-30 ~]# glustercli volume start testvol1
Volume testvol1 started successfully
[root@dhcp35-30 ~]# ps aux|grep gluster
root 25752 0.8 1.2 502444 23444 ? Ssl 12:24 0:14 /usr/sbin/glusterd2 --config=/etc/glusterd2/glusterd2.toml
root 25901 0.5 0.5 1637368 9864 ? Ssl 12:52 0:00 /usr/sbin/glusterfsd --volfile-server 127.0.0.1 --volfile-server-port 24007 --volfile-id testvol1.2115479a-c493-4b44-9119-aa78b0dfcd5e.bricks-brick1-rj6 -p /var/run/glusterd2/2115479a-c493-4b44-9119-aa78b0dfcd5e-bricks-brick1-rj6.pid -S /var/run/glusterd2/86a2491f6fd592ef.socket --brick-name /bricks/brick1/rj6 -l /var/log/glusterd2/glusterfs/bricks/bricks-brick1-rj6.log --xlator-option *-posix.glusterd-uuid=2115479a-c493-4b44-9119-aa78b0dfcd5e
root 25935 0.0 0.0 112708 980 pts/0 S+ 12:52 0:00 grep --color=auto gluster
[root@dhcp35-30 ~]# glustercli volume info
Volume Name: rj
Type: Distributed-Replicate
Volume ID: b315490e-49de-4edf-afd1-bc3ba56c63f6
State: Stopped
Transport-type: tcp
Options:
cluster/replicate.self-heal-daemon: on
performance/io-cache.io-cache: off
performance/md-cache.md-cache: off
performance/open-behind.open-behind: off
performance/quick-read.quick-read: off
performance/read-ahead.read-ahead: off
performance/readdir-ahead.readdir-ahead: off
performance/write-behind.write-behind: off
Number of Bricks: 2 x 3 = 6
Brick1: 10.70.35.30:/bricks/brick1/rj0
Brick2: 10.70.35.106:/bricks/brick1/rj1
Brick3: 10.70.35.240:/bricks/brick1/rj2
Brick4: 10.70.35.30:/bricks/brick1/rj3
Brick5: 10.70.35.106:/bricks/brick1/rj4
Brick6: 10.70.35.240:/bricks/brick1/rj5
Volume Name: testvol1
Type: Distributed-Replicate
Volume ID: b13e81a1-df35-4af2-b00f-9dd9b0c0ffd2
State: Started
Transport-type: tcp
Options:
performance/read-ahead.read-ahead: off
performance/readdir-ahead.readdir-ahead: off
performance/write-behind.write-behind: off
cluster/replicate.self-heal-daemon: on
performance/io-cache.io-cache: off
performance/md-cache.md-cache: off
performance/open-behind.open-behind: off
performance/quick-read.quick-read: off
Number of Bricks: 2 x 3 = 6
Brick1: 10.70.35.30:/bricks/brick1/rj6
Brick2: 10.70.35.106:/bricks/brick1/rj7
Brick3: 10.70.35.240:/bricks/brick1/rj8
Brick4: 10.70.35.30:/bricks/brick1/rj9
Brick5: 10.70.35.106:/bricks/brick1/rj10
Brick6: 10.70.35.240:/bricks/brick1/rj11
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]#
[root@dhcp35-30 ~]# systemctl status glusterd2
● glusterd2.service - GlusterD2, the management service for GlusterFS (pre-release)
Loaded: loaded (/usr/lib/systemd/system/glusterd2.service; enabled; vendor preset: disabled)
Active: active (running) since Thu 2019-01-10 12:24:18 IST; 28min ago
Main PID: 25752 (glusterd2)
CGroup: /system.slice/glusterd2.service
├─25752 /usr/sbin/glusterd2 --config=/etc/glusterd2/glusterd2.toml
└─25901 /usr/sbin/glusterfsd --volfile-server 127.0.0.1 --volfile-server-port 24007 --volf...
Jan 10 12:24:18 dhcp35-30.lab.eng.blr.redhat.com systemd[1]: Started GlusterD2, the management serv...).
Jan 10 12:24:18 dhcp35-30.lab.eng.blr.redhat.com glusterd2[25752]: time="2019-01-10T12:24:18+05:30" ...k
Jan 10 12:24:18 dhcp35-30.lab.eng.blr.redhat.com glusterd2[25752]: time="2019-01-10T12:24:18+05:30" ...k
Hint: Some lines were ellipsized, use -l to show in full.
[root@dhcp35-30 ~]# glustercli volume set testvol1 replicate.self-heal-daemon on --advanced
Options set successfully for testvol1 volume
[root@dhcp35-30 ~]# glustercli peer status
+--------------------------------------+-----------------------------------+--------------------+--------------------+--------+-------+
| ID | NAME | CLIENT ADDRESSES | PEER ADDRESSES | ONLINE | PID |
+--------------------------------------+-----------------------------------+--------------------+--------------------+--------+-------+
| 2115479a-c493-4b44-9119-aa78b0dfcd5e | dhcp35-30.lab.eng.blr.redhat.com | 127.0.0.1:24007 | 10.70.35.30:24008 | yes | 25752 |
| | | 10.70.35.30:24007 | | | |
| af6fbeb6-42ac-4080-b06e-bfb61f2b1ffa | dhcp35-106.lab.eng.blr.redhat.com | 127.0.0.1:24007 | 10.70.35.106:24008 | yes | 15661 |
| | | 10.70.35.106:24007 | | | |
| ca3f8907-4db7-4cff-aaf7-41f916ea91a1 | dhcp35-240.lab.eng.blr.redhat.com | 127.0.0.1:24007 | 10.70.35.240:24008 | yes | 16105 |
| | | 10.70.35.240:24007 | | | |
+--------------------------------------+-----------------------------------+--------------------+--------------------+--------+-------+
[root@dhcp35-30 ~]# glustercli volume status
Volume : rj
+--------------------------------------+--------------+--------------------+--------+------+-----+
| BRICK ID | HOST | PATH | ONLINE | PORT | PID |
+--------------------------------------+--------------+--------------------+--------+------+-----+
| a9bd1c3a-aa2f-4331-973d-76052109cdb9 | 10.70.35.30 | /bricks/brick1/rj0 | false | 0 | 0 |
| 2d1de027-4f99-4b02-9147-1c0bca7446a9 | 10.70.35.106 | /bricks/brick1/rj1 | false | 0 | 0 |
| 4137572a-5a4a-4bb7-a3ca-48ce229de802 | 10.70.35.240 | /bricks/brick1/rj2 | false | 0 | 0 |
| e833eac9-0c2b-463e-bcdb-90e4242a4a4b | 10.70.35.30 | /bricks/brick1/rj3 | false | 0 | 0 |
| 34209e13-353d-4efe-8e5d-d9bd5a78f5f0 | 10.70.35.106 | /bricks/brick1/rj4 | false | 0 | 0 |
| f1ae716a-37ab-4f73-92f8-e341e3196ca0 | 10.70.35.240 | /bricks/brick1/rj5 | false | 0 | 0 |
+--------------------------------------+--------------+--------------------+--------+------+-----+
Volume : testvol1
+--------------------------------------+--------------+---------------------+--------+-------+-------+
| BRICK ID | HOST | PATH | ONLINE | PORT | PID |
+--------------------------------------+--------------+---------------------+--------+-------+-------+
| 68c0da28-f723-4497-8e2a-2184f2b11708 | 10.70.35.30 | /bricks/brick1/rj6 | true | 36075 | 25901 |
| 9199e4e5-4e38-498f-9fcc-6111f71fea80 | 10.70.35.106 | /bricks/brick1/rj7 | true | 35484 | 15755 |
| 8d2b8ead-cfc3-4492-b09a-6ba090bb85de | 10.70.35.240 | /bricks/brick1/rj8 | true | 39239 | 16197 |
| 704e4634-0e32-432e-8d3f-f672600cd043 | 10.70.35.30 | /bricks/brick1/rj9 | true | 36075 | 25901 |
| c5fa470e-d10c-4d55-9b45-8a66e7fe6258 | 10.70.35.106 | /bricks/brick1/rj10 | true | 35484 | 15755 |
| 65df0fd3-64bd-4feb-bf58-7b756c52f9c2 | 10.70.35.240 | /bricks/brick1/rj11 | true | 39239 | 16197 |
+--------------------------------------+--------------+---------------------+--------+-------+-------+
[root@dhcp35-30 ~]# glustercli volume stop testvol1
Volume testvol1 stopped successfully
[root@dhcp35-30 ~]# glustercli volume start testvol1
Volume testvol1 started successfully
[root@dhcp35-30 ~]# glustercli volume set testvol1 replicate.self-heal-daemon on --advanced
Options set successfully for testvol1 volume
[root@dhcp35-30 ~]# ps aux|grep gluster
root 25752 0.7 1.5 502444 28416 ? Ssl 12:24 0:15 /usr/sbin/glusterd2 --config=/etc/glusterd2/glusterd2.toml
root 25998 0.3 0.5 1801224 10420 ? Ssl 12:57 0:00 /usr/sbin/glusterfsd --volfile-server 127.0.0.1 --volfile-server-port 24007 --volfile-id testvol1.2115479a-c493-4b44-9119-aa78b0dfcd5e.bricks-brick1-rj6 -p /var/run/glusterd2/2115479a-c493-4b44-9119-aa78b0dfcd5e-bricks-brick1-rj6.pid -S /var/run/glusterd2/86a2491f6fd592ef.socket --brick-name /bricks/brick1/rj6 -l /var/log/glusterd2/glusterfs/bricks/bricks-brick1-rj6.log --xlator-option *-posix.glusterd-uuid=2115479a-c493-4b44-9119-aa78b0dfcd5e
root 26042 0.0 0.0 112708 980 pts/0 S+ 12:57 0:00 grep --color=auto gluster
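At this point the testvol1 bricks are up, but no glustershd process was respawned after the earlier kill -9, even after stopping/starting the volume and re-setting replicate.self-heal-daemon. A suggested next step is to check the shd log and pid file (paths taken from the earlier glustershd command line):
# Suggested follow-up on this node
tail -n 50 /var/log/glusterd2/glusterfs/glustershd.log
ls -l /var/run/glusterd2/glustershd.pid   # a stale pid file left by kill -9 could block a respawn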
Expected/desired behavior
Details on how to reproduce (minimal and precise)
Information about the environment:
- Glusterd2 version used (e.g. v4.1.0 or master): glusterd version: v6.0-dev.108.git79d9987, git SHA: 79d9987, go version: go1.11.2, go OS/arch: linux/amd64
- Operating system used:
- Glusterd2 compiled from sources, as a package (rpm/deb), or container:
- Using External ETCD: (yes/no, if yes ETCD version):
- If container, which container image:
- Using kubernetes, openshift, or direct install:
- If kubernetes/openshift, is gluster running inside kubernetes/openshift or outside:
Other useful information
- glusterd2 config files from all nodes (default /etc/glusterd2/glusterd2.toml)
- glusterd2 log files from all nodes (default /var/log/glusterd2/glusterd2.log)
Attached log files (one per node): glusterd2.log, glusterd2.log, glusterd2.log
- ETCD configuration
- Contents of uuid.toml from all nodes (default /var/lib/glusterd2/uuid.toml)
- Output of statedump from any one of the nodes
Useful commands
- To get glusterd2 version: glusterd2 --version
- To get ETCD version: etcd --version
- To get output of statedump: curl http://glusterd2-IP:glusterd2-Port/statedump
The revised scope of GCS/1.0 is to rely on client-side self-heal to heal files when they are accessed. The glustershd piece still needs work on shd multiplexing to complete the overall effort, which is out of scope for GCS/1.0; hence the tag is being removed.
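For reference, client-side self-heal kicks in when a client accesses a file; a minimal sketch of exercising it (the mount point /mnt/testvol1 and the file name are hypothetical, and the standard GlusterFS mount syntax is assumed to work for glusterd2-managed volumes):
# Hypothetical client-side access; lookup on access is what triggers the client heal
mkdir -p /mnt/testvol1
mount -t glusterfs 10.70.35.30:/testvol1 /mnt/testvol1
stat /mnt/testvol1/somefile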