[BUG] Two replicas of one shard never finish syncing.
**Describe the bug**

Two replicas of one shard never finish syncing:

```
# Replication
role:master
connected_slaves:2
slave0:ip=10.136.129.145,port=6379,state=wait_bgsave,offset=0,lag=0
slave1:ip=10.136.243.146,port=6379,state=wait_bgsave,offset=0,lag=0
```
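The stuck state can be re-checked at any time from the primary pod (a minimal sketch; the pod, namespace, and container names are the ones used throughout this report):

```bash
# Both replicas stay in state=wait_bgsave with offset=0 indefinitely,
# instead of moving to state=online with a growing offset.
kubectl exec -n test rediscluster-update-test-88-4cgf6 -c redis-node -- \
  keydb-cli info replication | grep -E '^slave[01]:'
```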
**To Reproduce**

Create a cluster, then repeatedly re-attach replicas and run manual failovers (roughly as sketched below).
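A sketch of that loop, not the exact test harness; `<replica-pod>`, `<node-pod>`, and `<primary-node-id>` are placeholders, while `CLUSTER FAILOVER` and `CLUSTER REPLICATE` are the standard cluster commands:

```bash
# Promote a replica to primary of its shard (run against the replica).
kubectl exec -n test <replica-pod> -c redis-node -- keydb-cli cluster failover

# Re-attach a node as a replica of a given primary; the node ID
# comes from `keydb-cli cluster nodes`.
kubectl exec -n test <node-pod> -c redis-node -- \
  keydb-cli cluster replicate <primary-node-id>
```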
**Expected behavior**

After topology changes, replicas should complete syncing from the primary node.
**Additional information**

Cluster nodes:

```
skolosov@C02F34L1MD6R caching-infra % kubectl exec -it rediscluster-update-test-88-6tq2m -n test -c redis-node -- keydb-cli cluster nodes
f30037c4afef7a5ba19c810b1b4768bc8946b53d 10.136.241.13:6379@16379 master,fail - 1695158950093 1695158949088 15 connected
b45c4bed6b978e18f0340851c5af69e9cd2650c1 10.136.244.141:6379@16379 slave 3552bbd8ed39dc1322dcfd43827fc2d65f1729db 0 1695161717036 13 connected
3552bbd8ed39dc1322dcfd43827fc2d65f1729db 10.136.240.211:6379@16379 master - 0 1695161717136 13 connected 0-4095
e8fdb8f41cbc4452f5d1fba78fd48c7b0ff1dd3a 10.136.241.76:6379@16379 slave 3552bbd8ed39dc1322dcfd43827fc2d65f1729db 0 1695161717000 13 connected
65da31685e82c575774b7d3345df79c004658bbe 10.136.241.216:6379@16379 slave 483c38ddb492e0c84a3d21a6b66925dbecb58f4c 0 1695161717036 16 connected
a0cadd9d591882282267cd556498a9f1f4368a5b 10.136.244.15:6379@16379 master - 0 1695161717036 12 connected 4096-5461 9558-10923 15020-16383
483c38ddb492e0c84a3d21a6b66925dbecb58f4c 10.136.241.148:6379@16379 master - 0 1695161717036 16 connected 5462-9557
bcb9878aaf5cb6a589b36177102e9f04b6463faa 10.136.129.145:6379@16379 slave a0cadd9d591882282267cd556498a9f1f4368a5b 0 1695161717136 12 connected
1788d52122f37bbb5ccad866dea82be37dee36ea 10.136.243.146:6379@16379 myself,slave a0cadd9d591882282267cd556498a9f1f4368a5b 0 1695161716000 12 connected
9301a11909678d54db209471cfef80dd25b35843 10.136.241.18:6379@16379 master - 0 1695161717136 0 connected
216f289dd37ae8357478b0e3ede4dc28c9b8f8e2 10.136.242.215:6379@16379 slave f0f832c666bf5cc257cba8b0c80c5e0ffa78233f 0 1695161716935 10 connected
f0f832c666bf5cc257cba8b0c80c5e0ffa78233f 10.136.244.79:6379@16379 master - 0 1695161717136 10 connected 10924-15019
42cabc99bb858b19ce43e7b0f9b8661d16318ac1 10.136.245.20:6379@16379 slave 483c38ddb492e0c84a3d21a6b66925dbecb58f4c 0 1695161717000 16 connected
3e7ab941ba58af914e09b2e3bef95efa0adc0909 10.136.129.140:6379@16379 slave f0f832c666bf5cc257cba8b0c80c5e0ffa78233f 0 1695161717036 10 connected
```
List of K8s pods:

```
skolosov@C02F34L1MD6R caching-infra % kubectl get pods -n test -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
keydb-operator-64f968bd45-pqpk2 4/4 Running 0 6h 10.136.242.172 gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-tbds <none> <none>
rediscluster-update-test-88-2gtxb 1/1 Running 0 72m 10.136.241.216 gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-xzxq <none> <none>
rediscluster-update-test-88-4cgf6 1/1 Running 0 55m 10.136.244.15 gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q <none> <none>
rediscluster-update-test-88-6tq2m 1/1 Running 0 56m 10.136.243.146 gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw <none> <none>
rediscluster-update-test-88-7bnfj 1/1 Running 0 52m 10.136.240.211 gke-aerospike-skolosov--t--nodepool-1-3ff3f802-dq85 <none> <none>
rediscluster-update-test-88-88mh5 1/1 Running 0 43m 10.136.241.18 gke-aerospike-skolosov--t--nodepool-1-3ff3f802-z7gq <none> <none>
rediscluster-update-test-88-hc5sl 1/1 Running 0 72m 10.136.244.79 gke-aerospike-skolosov--t--nodepool-1-4475920e-d92j <none> <none>
rediscluster-update-test-88-jx7q2 1/1 Running 0 45m 10.136.242.215 gke-aerospike-skolosov--t--nodepool-1-3ff3f802-6nrs <none> <none>
rediscluster-update-test-88-mv7bw 1/1 Running 0 55m 10.136.129.145 gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-z6q4 <none> <none>
rediscluster-update-test-88-mv7g4 1/1 Running 0 72m 10.136.241.76 gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-sf2v <none> <none>
rediscluster-update-test-88-n2vzl 1/1 Running 0 72m 10.136.129.140 gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-z6q4 <none> <none>
rediscluster-update-test-88-qv68h 1/1 Running 0 50m 10.136.241.148 gke-aerospike-skolosov--t--nodepool-1-4475920e-rnrx <none> <none>
rediscluster-update-test-88-tg5rw 1/1 Running 0 47m 10.136.245.20 gke-aerospike-skolosov--t--nodepool-1-3ff3f802-7rqn <none> <none>
rediscluster-update-test-88-th92z 1/1 Running 0 72m 10.136.244.141 gke-aerospike-skolosov--t--nodepool-1-4475920e-zr2r <none> <none>
```
Primary pod: rediscluster-update-test-88-4cgf6
Secondary pod: rediscluster-update-test-88-6tq2m

The K8s node descriptions, pod logs, and cluster node info for these two pods are attached below.
CPU consumption on the primary node is spiky, fluctuating between 0.3 and 0.7 cores and only rarely reaching 1.0:

```
root@rediscluster-update-test-88-4cgf6:/data# top
top - 22:41:48 up 2:31, 0 users, load average: 0.18, 0.16, 0.12
Tasks: 5 total, 1 running, 4 sleeping, 0 stopped, 0 zombie
%Cpu(s): 5.9 us, 2.5 sy, 0.0 ni, 89.6 id, 1.8 wa, 0.0 hi, 0.2 si, 0.0 st
MiB Mem : 7954.7 total, 926.4 free, 709.8 used, 6318.5 buff/cache
MiB Swap: 0.0 total, 0.0 free, 0.0 used. 6983.2 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
13 root 20 0 153584 15564 11988 S 0.7 0.2 0:38.75 keydb-server
1 root 20 0 754960 35712 25244 S 0.0 0.4 0:01.62 node
28 root 20 0 153584 4596 636 S 0.0 0.1 0:00.02 keydb-server
57 root 20 0 4248 3464 3020 S 0.0 0.0 0:00.00 bash
65 root 20 0 6084 3312 2816 R 0.0 0.0 0:00.00 top
```
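Note that pid 28 here is the BGSAVE child the primary forked for the SYNC (the primary log below reports "Background saving started by pid 28"), yet it has accumulated almost no CPU time. Whether that save ever finishes can be watched via the persistence counters (a sketch; the field names appear in the INFO dumps attached at the end):

```bash
# On a healthy full sync, rdb_bgsave_in_progress drops back to 0 once
# the RDB is written; here it stays 1 while the timer keeps growing.
kubectl exec -n test rediscluster-update-test-88-4cgf6 -c redis-node -- \
  keydb-cli info persistence | \
  grep -E 'rdb_bgsave_in_progress|rdb_current_bgsave_time_sec'
```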
CPU consumption on the replica node is similarly spiky, between 0.3 and 0.7 cores:

```
root@rediscluster-update-test-88-6tq2m:/data# top
top - 22:45:13 up 2:34, 0 users, load average: 0.15, 0.18, 0.17
Tasks: 4 total, 1 running, 3 sleeping, 0 stopped, 0 zombie
%Cpu(s): 8.1 us, 2.7 sy, 0.0 ni, 87.4 id, 1.5 wa, 0.0 hi, 0.3 si, 0.0 st
MiB Mem : 7954.7 total, 903.1 free, 733.4 used, 6318.2 buff/cache
MiB Swap: 0.0 total, 0.0 free, 0.0 used. 6972.0 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
14 root 20 0 153584 15376 11932 S 0.7 0.2 0:33.01 keydb-server
1 root 20 0 754704 37824 27516 S 0.0 0.5 0:01.60 node
39 root 20 0 4248 3376 2924 S 0.0 0.0 0:00.00 bash
47 root 20 0 6084 3280 2788 R 0.0 0.0 0:00.00 top
```
Description of the primary pod's K8s node:

```
skolosov@C02F34L1MD6R ~ % cat ~/gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q.desc
Name: gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
Roles: <none>
Labels: beta.kubernetes.io/arch=amd64
beta.kubernetes.io/instance-type=n2d-standard-2
beta.kubernetes.io/os=linux
cloud.google.com/gke-boot-disk=pd-balanced
cloud.google.com/gke-container-runtime=containerd
cloud.google.com/gke-cpu-scaling-level=2
cloud.google.com/gke-logging-variant=DEFAULT
cloud.google.com/gke-max-pods-per-node=32
cloud.google.com/gke-nodepool=nodepool-1
cloud.google.com/gke-os-distribution=cos
cloud.google.com/gke-provisioning=standard
cloud.google.com/gke-stack-type=IPV4
cloud.google.com/machine-family=n2d
failure-domain.beta.kubernetes.io/region=us-central1
failure-domain.beta.kubernetes.io/zone=us-central1-b
kubernetes.io/arch=amd64
kubernetes.io/hostname=gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
kubernetes.io/os=linux
node.kubernetes.io/instance-type=n2d-standard-2
topology.gke.io/zone=us-central1-b
topology.kubernetes.io/region=us-central1
topology.kubernetes.io/zone=us-central1-b
Annotations: container.googleapis.com/instance_id: 6903402144618495690
csi.volume.kubernetes.io/nodeid:
{"pd.csi.storage.gke.io":"projects/skolosov-aerospike-test/zones/us-central1-b/instances/gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn...
node.alpha.kubernetes.io/ttl: 0
node.gke.io/last-applied-node-labels:
cloud.google.com/gke-boot-disk=pd-balanced,cloud.google.com/gke-container-runtime=containerd,cloud.google.com/gke-cpu-scaling-level=2,clou...
node.gke.io/last-applied-node-taints:
volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp: Tue, 19 Sep 2023 13:10:38 -0700
Taints: <none>
Unschedulable: false
Lease:
HolderIdentity: gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
AcquireTime: <unset>
RenewTime: Tue, 19 Sep 2023 15:15:51 -0700
Conditions:
Type Status LastHeartbeatTime LastTransitionTime Reason Message
---- ------ ----------------- ------------------ ------ -------
FrequentDockerRestart False Tue, 19 Sep 2023 15:10:56 -0700 Tue, 19 Sep 2023 13:10:43 -0700 NoFrequentDockerRestart docker is functioning properly
FrequentContainerdRestart False Tue, 19 Sep 2023 15:10:56 -0700 Tue, 19 Sep 2023 13:10:43 -0700 NoFrequentContainerdRestart containerd is functioning properly
FrequentUnregisterNetDevice False Tue, 19 Sep 2023 15:10:56 -0700 Tue, 19 Sep 2023 13:10:43 -0700 NoFrequentUnregisterNetDevice node is functioning properly
KernelDeadlock False Tue, 19 Sep 2023 15:10:56 -0700 Tue, 19 Sep 2023 13:10:43 -0700 KernelHasNoDeadlock kernel has no deadlock
ReadonlyFilesystem False Tue, 19 Sep 2023 15:10:56 -0700 Tue, 19 Sep 2023 13:10:43 -0700 FilesystemIsNotReadOnly Filesystem is not read-only
CorruptDockerOverlay2 False Tue, 19 Sep 2023 15:10:56 -0700 Tue, 19 Sep 2023 13:10:43 -0700 NoCorruptDockerOverlay2 docker overlay2 is functioning properly
FrequentKubeletRestart False Tue, 19 Sep 2023 15:10:56 -0700 Tue, 19 Sep 2023 13:10:43 -0700 NoFrequentKubeletRestart kubelet is functioning properly
NetworkUnavailable False Tue, 19 Sep 2023 13:10:39 -0700 Tue, 19 Sep 2023 13:10:39 -0700 RouteCreated NodeController create implicit route
MemoryPressure False Tue, 19 Sep 2023 15:14:38 -0700 Tue, 19 Sep 2023 13:10:37 -0700 KubeletHasSufficientMemory kubelet has sufficient memory available
DiskPressure False Tue, 19 Sep 2023 15:14:38 -0700 Tue, 19 Sep 2023 13:10:37 -0700 KubeletHasNoDiskPressure kubelet has no disk pressure
PIDPressure False Tue, 19 Sep 2023 15:14:38 -0700 Tue, 19 Sep 2023 13:10:37 -0700 KubeletHasSufficientPID kubelet has sufficient PID available
Ready True Tue, 19 Sep 2023 15:14:38 -0700 Tue, 19 Sep 2023 13:10:39 -0700 KubeletReady kubelet is posting ready status. AppArmor enabled
Addresses:
InternalIP: 10.110.45.32
ExternalIP: 34.16.61.87
Hostname: gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
Capacity:
cpu: 2
ephemeral-storage: 16266500Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 8145592Ki
pods: 32
Allocatable:
cpu: 1930m
ephemeral-storage: 4253788136
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 6180536Ki
pods: 32
System Info:
Machine ID: 1d49f7a789402030cda0db23e9fef3c8
System UUID: 1d49f7a7-8940-2030-cda0-db23e9fef3c8
Boot ID: 28956ffd-0c5a-4583-a872-c4c6b19ef842
Kernel Version: 5.15.107+
OS Image: Container-Optimized OS from Google
Operating System: linux
Architecture: amd64
Container Runtime Version: containerd://1.6.18
Kubelet Version: v1.25.10-gke.2700
Kube-Proxy Version: v1.25.10-gke.2700
PodCIDR: 10.136.244.0/26
PodCIDRs: 10.136.244.0/26
ProviderID: gce://skolosov-aerospike-test/us-central1-b/gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
Non-terminated Pods: (7 in total)
Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits AGE
--------- ---- ------------ ---------- --------------- ------------- ---
kube-system fluentbit-gke-hplz6 100m (5%) 0 (0%) 200Mi (3%) 500Mi (8%) 125m
kube-system gke-metrics-agent-945lg 6m (0%) 0 (0%) 100Mi (1%) 100Mi (1%) 125m
kube-system kube-proxy-gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q 100m (5%) 0 (0%) 0 (0%) 0 (0%) 125m
kube-system pdcsi-node-b4bjk 10m (0%) 0 (0%) 20Mi (0%) 100Mi (1%) 125m
mesh-system cadvisor-bjwgj 300m (15%) 900m (46%) 400Mi (6%) 1224Mi (20%) 125m
mesh-system runtime-security-monitoring-xkqtp 110m (5%) 600m (31%) 300Mi (4%) 900Mi (14%) 125m
test rediscluster-update-test-88-4cgf6 200m (10%) 1 (51%) 50Mi (0%) 50Mi (0%) 59m
Allocated resources:
(Total limits may be over 100 percent, i.e., overcommitted.)
Resource Requests Limits
-------- -------- ------
cpu 826m (42%) 2500m (129%)
memory 1070Mi (17%) 2874Mi (47%)
ephemeral-storage 0 (0%) 0 (0%)
hugepages-1Gi 0 (0%) 0 (0%)
hugepages-2Mi 0 (0%) 0 (0%)
Events: <none>
```
Description of the replica pod's K8s node:

```
skolosov@C02F34L1MD6R ~ % cat ~/gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw.desc
Name: gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
Roles: <none>
Labels: beta.kubernetes.io/arch=amd64
beta.kubernetes.io/instance-type=n2d-standard-2
beta.kubernetes.io/os=linux
cloud.google.com/gke-boot-disk=pd-balanced
cloud.google.com/gke-container-runtime=containerd
cloud.google.com/gke-cpu-scaling-level=2
cloud.google.com/gke-logging-variant=DEFAULT
cloud.google.com/gke-max-pods-per-node=32
cloud.google.com/gke-nodepool=nodepool-1
cloud.google.com/gke-os-distribution=cos
cloud.google.com/gke-provisioning=standard
cloud.google.com/gke-stack-type=IPV4
cloud.google.com/machine-family=n2d
failure-domain.beta.kubernetes.io/region=us-central1
failure-domain.beta.kubernetes.io/zone=us-central1-c
kubernetes.io/arch=amd64
kubernetes.io/hostname=gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
kubernetes.io/os=linux
node.kubernetes.io/instance-type=n2d-standard-2
topology.gke.io/zone=us-central1-c
topology.kubernetes.io/region=us-central1
topology.kubernetes.io/zone=us-central1-c
Annotations: container.googleapis.com/instance_id: 2168972145624334031
csi.volume.kubernetes.io/nodeid:
{"pd.csi.storage.gke.io":"projects/skolosov-aerospike-test/zones/us-central1-c/instances/gke-aerospike-skolosov--t--nodepool-1-4475920e-ct...
node.alpha.kubernetes.io/ttl: 0
node.gke.io/last-applied-node-labels:
cloud.google.com/gke-boot-disk=pd-balanced,cloud.google.com/gke-container-runtime=containerd,cloud.google.com/gke-cpu-scaling-level=2,clou...
node.gke.io/last-applied-node-taints:
volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp: Tue, 19 Sep 2023 13:10:33 -0700
Taints: <none>
Unschedulable: false
Lease:
HolderIdentity: gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
AcquireTime: <unset>
RenewTime: Tue, 19 Sep 2023 15:34:37 -0700
Conditions:
Type Status LastHeartbeatTime LastTransitionTime Reason Message
---- ------ ----------------- ------------------ ------ -------
FrequentDockerRestart False Tue, 19 Sep 2023 15:30:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 NoFrequentDockerRestart docker is functioning properly
FrequentContainerdRestart False Tue, 19 Sep 2023 15:30:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 NoFrequentContainerdRestart containerd is functioning properly
CorruptDockerOverlay2 False Tue, 19 Sep 2023 15:30:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 NoCorruptDockerOverlay2 docker overlay2 is functioning properly
FrequentUnregisterNetDevice False Tue, 19 Sep 2023 15:30:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 NoFrequentUnregisterNetDevice node is functioning properly
KernelDeadlock False Tue, 19 Sep 2023 15:30:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 KernelHasNoDeadlock kernel has no deadlock
ReadonlyFilesystem False Tue, 19 Sep 2023 15:30:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 FilesystemIsNotReadOnly Filesystem is not read-only
FrequentKubeletRestart False Tue, 19 Sep 2023 15:30:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 NoFrequentKubeletRestart kubelet is functioning properly
NetworkUnavailable False Tue, 19 Sep 2023 13:10:34 -0700 Tue, 19 Sep 2023 13:10:34 -0700 RouteCreated NodeController create implicit route
MemoryPressure False Tue, 19 Sep 2023 15:29:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 KubeletHasSufficientMemory kubelet has sufficient memory available
DiskPressure False Tue, 19 Sep 2023 15:29:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 KubeletHasNoDiskPressure kubelet has no disk pressure
PIDPressure False Tue, 19 Sep 2023 15:29:51 -0700 Tue, 19 Sep 2023 13:10:33 -0700 KubeletHasSufficientPID kubelet has sufficient PID available
Ready True Tue, 19 Sep 2023 15:29:51 -0700 Tue, 19 Sep 2023 13:10:34 -0700 KubeletReady kubelet is posting ready status. AppArmor enabled
Addresses:
InternalIP: 10.110.45.30
ExternalIP: 34.30.166.105
Hostname: gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
Capacity:
cpu: 2
ephemeral-storage: 16266500Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 8145592Ki
pods: 32
Allocatable:
cpu: 1930m
ephemeral-storage: 4253788136
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 6180536Ki
pods: 32
System Info:
Machine ID: d99af6d80907f802466ca1adb662b647
System UUID: d99af6d8-0907-f802-466c-a1adb662b647
Boot ID: 6e256fbd-b4b0-483e-92e9-fcc363a39dc9
Kernel Version: 5.15.107+
OS Image: Container-Optimized OS from Google
Operating System: linux
Architecture: amd64
Container Runtime Version: containerd://1.6.18
Kubelet Version: v1.25.10-gke.2700
Kube-Proxy Version: v1.25.10-gke.2700
PodCIDR: 10.136.243.128/26
PodCIDRs: 10.136.243.128/26
ProviderID: gce://skolosov-aerospike-test/us-central1-c/gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
Non-terminated Pods: (7 in total)
Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits AGE
--------- ---- ------------ ---------- --------------- ------------- ---
kube-system fluentbit-gke-t5wzw 100m (5%) 0 (0%) 200Mi (3%) 500Mi (8%) 144m
kube-system gke-metrics-agent-c7bvf 6m (0%) 0 (0%) 100Mi (1%) 100Mi (1%) 144m
kube-system kube-proxy-gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw 100m (5%) 0 (0%) 0 (0%) 0 (0%) 144m
kube-system pdcsi-node-tq6ft 10m (0%) 0 (0%) 20Mi (0%) 100Mi (1%) 144m
mesh-system cadvisor-d2cst 300m (15%) 900m (46%) 400Mi (6%) 1224Mi (20%) 144m
mesh-system runtime-security-monitoring-7p6hm 110m (5%) 600m (31%) 300Mi (4%) 900Mi (14%) 144m
test rediscluster-update-test-88-6tq2m 200m (10%) 1 (51%) 50Mi (0%) 50Mi (0%) 79m
Allocated resources:
(Total limits may be over 100 percent, i.e., overcommitted.)
Resource Requests Limits
-------- -------- ------
cpu 826m (42%) 2500m (129%)
memory 1070Mi (17%) 2874Mi (47%)
ephemeral-storage 0 (0%) 0 (0%)
hugepages-1Gi 0 (0%) 0 (0%)
hugepages-2Mi 0 (0%) 0 (0%)
Events: <none>
```
Logs of the primary pod (rediscluster-update-test-88-4cgf6):

```
I0919 21:16:50.434129 1 redisnode.go:101] Delaying server start for 5s...
Program started at: 2023-09-19 21:16:50.43341675 +0000 UTC m=+0.029030365
REDIS_VERSION=6.2.6
W0919 21:16:55.434623 1 node.go:207] Config /redis-conf/redis.conf not found
I0919 21:16:55.434949 1 node.go:242] Clearing '/redis-data' folder...
I0919 21:16:55.435158 1 redisnode.go:555] Trying test connection to 10.136.244.15:6379 till 2023-09-19 21:17:05.43514852 +0000 UTC m=+15.030762115
I0919 21:16:55.435287 1 redisnode.go:512] Listening on http://0.0.0.0:8088
E0919 21:16:55.435630 1 redisnode.go:578] Can't create client to 10.136.244.15:6379: dial tcp 10.136.244.15:6379: connect: connection refused
I0919 21:16:55.435730 1 redisnode.go:563] Retrying ping to 10.136.244.15:6379 in 0.5 seconds
13:13:C 19 Sep 2023 21:16:55.449 # oO0OoO0OoO0Oo KeyDB is starting oO0OoO0OoO0Oo
13:13:C 19 Sep 2023 21:16:55.449 # KeyDB version=255.255.255, bits=64, commit=9251f1c2, modified=1, pid=13, just started
13:13:C 19 Sep 2023 21:16:55.449 # Configuration loaded
13:13:M 19 Sep 2023 21:16:55.450 * monotonic clock: POSIX clock_gettime
13:13:M 19 Sep 2023 21:16:55.450 * No cluster configuration found, I'm a0cadd9d591882282267cd556498a9f1f4368a5b
13:13:M 19 Sep 2023 21:16:55.453 * Running mode=cluster, port=6379.
13:13:M 19 Sep 2023 21:16:55.453 # Server initialized
13:21:M 19 Sep 2023 21:16:55.454 * Thread 0 alive.
13:22:M 19 Sep 2023 21:16:55.455 * Thread 1 alive.
I0919 21:16:55.937823 1 redisnode.go:555] Trying test connection to 10.136.244.15:6379 till 2023-09-19 21:17:05.43514852 +0000 UTC m=+15.030762115
I0919 21:16:55.938481 1 redisnode.go:590] Successfully pinged local server at 10.136.244.15:6379.
I0919 21:16:56.449983 1 redisnode.go:227] REDIS Node addresses:10.136.129.140:6379
I0919 21:16:56.450019 1 redisnode.go:227] REDIS Node addresses:10.136.241.13:6379
I0919 21:16:56.450025 1 redisnode.go:227] REDIS Node addresses:10.136.241.14:6379
I0919 21:16:56.450030 1 redisnode.go:227] REDIS Node addresses:10.136.241.216:6379
I0919 21:16:56.450035 1 redisnode.go:227] REDIS Node addresses:10.136.241.76:6379
I0919 21:16:56.450039 1 redisnode.go:227] REDIS Node addresses:10.136.243.12:6379
I0919 21:16:56.450044 1 redisnode.go:227] REDIS Node addresses:10.136.243.146:6379
I0919 21:16:56.450050 1 redisnode.go:227] REDIS Node addresses:10.136.244.141:6379
I0919 21:16:56.450057 1 redisnode.go:227] REDIS Node addresses:10.136.244.202:6379
I0919 21:16:56.450061 1 redisnode.go:227] REDIS Node addresses:10.136.244.79:6379
I0919 21:16:56.468318 1 redisnode.go:200] Attaching node 10.136.244.15:6379 to the cluster
I0919 21:16:56.468350 1 admin.go:213] Attaching node 10.136.244.15:6379 to the cluster.
I0919 21:16:56.469656 1 admin.go:246] node 10.136.244.15:6379 attached properly
I0919 21:16:56.469715 1 redisnode.go:213] RedisNode: Running properly
I0919 21:16:56.469746 1 redisnode.go:122] Awaiting stop signal
13:21:M 19 Sep 2023 21:16:56.470 # IP address for this node updated to 10.136.244.15
13:21:M 19 Sep 2023 21:16:59.493 # Cluster state changed: ok
I0919 21:17:14.716124 1 redisnode.go:439] Readiness probe ok
I0919 21:17:14.716517 1 redisnode.go:457] Liveness probe ok
13:22:M 19 Sep 2023 21:17:40.900 * Replica 10.136.129.145:6379 asks for synchronization
13:22:M 19 Sep 2023 21:17:40.900 * Partial resynchronization not accepted: Replication ID mismatch (Replica asked for '59198a1f3c7c39efe03d0de2cf64af6b2abdde26', my replication IDs are '60fddad9b8c101d756f06341b2d02ab3d036629f' and '0000000000000000000000000000000000000000')
13:22:M 19 Sep 2023 21:17:40.900 * Replication backlog created, my new replication IDs are '6259fa4c9af68a358fd7d4f20804f33b84d847e8' and '0000000000000000000000000000000000000000'
13:22:M 19 Sep 2023 21:17:40.900 * Starting BGSAVE for SYNC with target: disk
13:22:M 19 Sep 2023 21:17:40.918 * Background saving started by pid 28
13:22:M 19 Sep 2023 21:17:40.918 * Background saving started
13:21:M 19 Sep 2023 21:17:40.921 * Replica 10.136.243.146:6379 asks for synchronization
13:21:M 19 Sep 2023 21:17:40.922 * Partial resynchronization not accepted: Replication ID mismatch (Replica asked for 'b4b77bc2319ace4bd216d05397828e6c9a510cde', my replication IDs are '6259fa4c9af68a358fd7d4f20804f33b84d847e8' and '0000000000000000000000000000000000000000')
13:21:M 19 Sep 2023 21:17:40.922 * Waiting for end of BGSAVE for SYNC
13:21:M 19 Sep 2023 21:20:46.513 # Failover auth granted to 3552bbd8ed39dc1322dcfd43827fc2d65f1729db for epoch 13
13:21:M 19 Sep 2023 21:23:03.417 # Failover auth granted to 483c38ddb492e0c84a3d21a6b66925dbecb58f4c for epoch 14
13:21:M 19 Sep 2023 21:25:11.462 # Failover auth granted to f30037c4afef7a5ba19c810b1b4768bc8946b53d for epoch 15
13:21:M 19 Sep 2023 21:25:29.705 # Failover auth granted to 483c38ddb492e0c84a3d21a6b66925dbecb58f4c for epoch 16
13:21:M 19 Sep 2023 21:29:13.016 * FAIL message received from 65da31685e82c575774b7d3345df79c004658bbe about f30037c4afef7a5ba19c810b1b4768bc8946b53d
```
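The "Replication ID mismatch" entries above are why both replicas fall back to a full resync, which in turn depends on the BGSAVE that never finishes. The mismatch can be confirmed by comparing replication IDs on both sides (a sketch based on the INFO fields shown in the attachments below):

```bash
# The replica keeps requesting replid b4b77bc2..., while the primary's
# current replid is 6259fa4c..., so partial resync is refused.
for pod in rediscluster-update-test-88-4cgf6 rediscluster-update-test-88-6tq2m; do
  echo "== $pod"
  kubectl exec -n test "$pod" -c redis-node -- \
    keydb-cli info replication | grep '^master_replid'
done
```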
Logs of the replica pod (rediscluster-update-test-88-6tq2m):

```
skolosov@C02F34L1MD6R ~ % cat ~/rediscluster-update-test-88-6tq2m.logs
Program started at: 2023-09-19 21:15:48.051282077 +0000 UTC m=+0.019104310
REDIS_VERSION=6.2.6
I0919 21:15:48.051777 1 redisnode.go:101] Delaying server start for 5s...
W0919 21:15:53.052298 1 node.go:207] Config /redis-conf/redis.conf not found
I0919 21:15:53.052614 1 node.go:242] Clearing '/redis-data' folder...
I0919 21:15:53.052780 1 redisnode.go:555] Trying test connection to 10.136.243.146:6379 till 2023-09-19 21:16:03.052760049 +0000 UTC m=+15.020582262
I0919 21:15:53.052948 1 redisnode.go:512] Listening on http://0.0.0.0:8088
E0919 21:15:53.053547 1 redisnode.go:578] Can't create client to 10.136.243.146:6379: dial tcp 10.136.243.146:6379: connect: connection refused
I0919 21:15:53.053565 1 redisnode.go:563] Retrying ping to 10.136.243.146:6379 in 0.5 seconds
14:14:C 19 Sep 2023 21:15:53.067 # oO0OoO0OoO0Oo KeyDB is starting oO0OoO0OoO0Oo
14:14:C 19 Sep 2023 21:15:53.067 # KeyDB version=255.255.255, bits=64, commit=9251f1c2, modified=1, pid=14, just started
14:14:C 19 Sep 2023 21:15:53.067 # Configuration loaded
14:14:M 19 Sep 2023 21:15:53.068 * monotonic clock: POSIX clock_gettime
14:14:M 19 Sep 2023 21:15:53.068 * No cluster configuration found, I'm 1788d52122f37bbb5ccad866dea82be37dee36ea
14:14:M 19 Sep 2023 21:15:53.071 * Running mode=cluster, port=6379.
14:14:M 19 Sep 2023 21:15:53.071 # Server initialized
14:23:M 19 Sep 2023 21:15:53.072 * Thread 0 alive.
14:24:M 19 Sep 2023 21:15:53.072 * Thread 1 alive.
I0919 21:15:53.554156 1 redisnode.go:555] Trying test connection to 10.136.243.146:6379 till 2023-09-19 21:16:03.052760049 +0000 UTC m=+15.020582262
I0919 21:15:53.554905 1 redisnode.go:590] Successfully pinged local server at 10.136.243.146:6379.
I0919 21:15:54.065217 1 redisnode.go:227] REDIS Node addresses:10.136.129.140:6379
I0919 21:15:54.065250 1 redisnode.go:227] REDIS Node addresses:10.136.241.13:6379
I0919 21:15:54.065256 1 redisnode.go:227] REDIS Node addresses:10.136.241.14:6379
I0919 21:15:54.065262 1 redisnode.go:227] REDIS Node addresses:10.136.241.216:6379
I0919 21:15:54.065267 1 redisnode.go:227] REDIS Node addresses:10.136.241.76:6379
I0919 21:15:54.065272 1 redisnode.go:227] REDIS Node addresses:10.136.243.12:6379
I0919 21:15:54.065277 1 redisnode.go:227] REDIS Node addresses:10.136.244.141:6379
I0919 21:15:54.065282 1 redisnode.go:227] REDIS Node addresses:10.136.244.202:6379
I0919 21:15:54.065286 1 redisnode.go:227] REDIS Node addresses:10.136.244.79:6379
I0919 21:15:54.078654 1 redisnode.go:200] Attaching node 10.136.243.146:6379 to the cluster
I0919 21:15:54.078683 1 admin.go:213] Attaching node 10.136.243.146:6379 to the cluster.
I0919 21:15:54.080145 1 admin.go:246] node 10.136.243.146:6379 attached properly
I0919 21:15:54.080197 1 redisnode.go:213] RedisNode: Running properly
I0919 21:15:54.080217 1 redisnode.go:122] Awaiting stop signal
14:23:M 19 Sep 2023 21:15:54.096 # IP address for this node updated to 10.136.243.146
14:23:M 19 Sep 2023 21:15:57.102 # Cluster state changed: ok
I0919 21:16:12.367623 1 redisnode.go:457] Liveness probe ok
I0919 21:16:12.368624 1 redisnode.go:439] Readiness probe ok
14:23:S 19 Sep 2023 21:17:40.897 * Before turning into a replica, using my own master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer.
14:23:S 19 Sep 2023 21:17:40.898 * Connecting to MASTER 10.136.244.15:6379
14:23:S 19 Sep 2023 21:17:40.898 * MASTER <-> REPLICA sync started
14:23:S 19 Sep 2023 21:17:40.898 * Non blocking connect for SYNC fired the event.
14:23:S 19 Sep 2023 21:17:40.901 * Master replied to PING, replication can continue...
14:23:S 19 Sep 2023 21:17:40.921 * Trying a partial resynchronization (request b4b77bc2319ace4bd216d05397828e6c9a510cde:1).
14:23:S 19 Sep 2023 21:17:40.922 * Full resync from master: 6259fa4c9af68a358fd7d4f20804f33b84d847e8:0
14:23:S 19 Sep 2023 21:17:40.922 * Discarding previously cached master state.
NOTICE: Detuning locks due to high load per core: 109.23%
NOTICE: Detuning locks due to high load per core: 95.90%
NOTICE: Detuning locks due to high load per core: 101.46%
14:23:S 19 Sep 2023 21:29:13.016 * FAIL message received from 65da31685e82c575774b7d3345df79c004658bbe about f30037c4afef7a5ba19c810b1b4768bc8946b53d
```
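From the replica side, the INFO counters below confirm that no RDB payload ever arrives; they can be polled to watch for progress (a sketch; `master_sync_read_bytes` is the field shown in the replica INFO dump):

```bash
# master_sync_read_bytes should grow as the RDB streams in; here it
# stays at 0 while master_sync_in_progress remains 1.
kubectl exec -n test rediscluster-update-test-88-6tq2m -c redis-node -- \
  keydb-cli info replication | \
  grep -E 'master_sync_in_progress|master_sync_read_bytes'
```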
INFO output from the replica pod:

```
skolosov@C02F34L1MD6R ~ % cat ~/rediscluster-update-test-88-6tq2m.info
# Server
redis_version:255.255.255
redis_git_sha1:9251f1c2
redis_git_dirty:1
redis_build_id:b2dd10e11e590621
redis_mode:cluster
os:Linux 5.15.107+ x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:9.4.0
process_id:14
process_supervised:no
run_id:f4af81473588adb1f1a1ad868ef256ef34fb20d6
tcp_port:6379
server_time_usec:1695161887678526
uptime_in_seconds:3734
uptime_in_days:0
hz:10
configured_hz:10
lru_clock:663071
executable:/data/keydb-server
config_file:/redis-conf/redis.conf
availability_zone:us-central1-c
features:cluster_mget
# Clients
connected_clients:1
cluster_connections:26
maxclients:1000
client_recent_max_input_buffer:0
client_recent_max_output_buffer:0
blocked_clients:0
tracking_clients:0
clients_in_timeout_table:0
current_client_thread:0
thread_0_clients:1
thread_1_clients:0
# Memory
used_memory:1505368
used_memory_human:1.44M
used_memory_rss:15998976
used_memory_rss_human:15.26M
used_memory_peak:1648056
used_memory_peak_human:1.57M
used_memory_peak_perc:91.34%
used_memory_overhead:1237256
used_memory_startup:1237256
used_memory_dataset:268112
used_memory_dataset_perc:100.00%
allocator_allocated:2440600
allocator_active:3211264
allocator_resident:5980160
total_system_memory:8341086208
total_system_memory_human:7.77G
used_memory_lua:37888
used_memory_lua_human:37.00K
used_memory_scripts:0
used_memory_scripts_human:0B
number_of_cached_scripts:0
maxmemory:20000000
maxmemory_human:19.07M
maxmemory_policy:noeviction
allocator_frag_ratio:1.32
allocator_frag_bytes:770664
allocator_rss_ratio:1.86
allocator_rss_bytes:2768896
rss_overhead_ratio:2.68
rss_overhead_bytes:10018816
mem_fragmentation_ratio:11.10
mem_fragmentation_bytes:14557400
mem_not_counted_for_evict:0
mem_replication_backlog:0
mem_clients_slaves:0
mem_clients_normal:0
mem_aof_buffer:0
mem_allocator:jemalloc-5.2.1
active_defrag_running:0
lazyfree_pending_objects:0
lazyfreed_objects:0
storage_provider:none
available_system_memory:unavailable
# Persistence
loading:0
current_cow_size:0
current_cow_size_age:0
current_fork_perc:0.00
current_save_keys_processed:0
current_save_keys_total:0
rdb_changes_since_last_save:0
rdb_bgsave_in_progress:0
rdb_last_save_time:1695158153
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:-1
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:0
aof_enabled:0
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:-1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:0
module_fork_in_progress:0
module_fork_last_cow_size:0
# Stats
total_connections_received:2453
total_commands_processed:1572
instantaneous_ops_per_sec:0
total_net_input_bytes:46154
total_net_output_bytes:3143735
instantaneous_input_kbps:0.02
instantaneous_output_kbps:0.93
rejected_connections:0
sync_full:0
sync_partial_ok:0
sync_partial_err:0
expired_keys:0
expired_stale_perc:0.00
expired_time_cap_reached_count:0
expire_cycle_cpu_milliseconds:1
evicted_keys:0
keyspace_hits:0
keyspace_misses:0
pubsub_channels:0
pubsub_patterns:0
latest_fork_usec:0
total_forks:0
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
tracking_total_keys:0
tracking_total_items:0
tracking_total_prefixes:0
unexpected_error_replies:0
total_error_replies:0
dump_payload_sanitizations:0
total_reads_processed:4025
total_writes_processed:1572
instantaneous_lock_contention:1
avg_lock_contention:0.390625
storage_provider_read_hits:0
storage_provider_read_misses:0
# Replication
role:slave
master_global_link_status:down
connected_masters:0
master_host:10.136.244.15
master_port:6379
master_link_status:down
master_last_io_seconds_ago:-1
master_sync_in_progress:1
slave_read_repl_offset:1
slave_repl_offset:1
master_sync_total_bytes:-1
master_sync_read_bytes:0
master_sync_left_bytes:-1
master_sync_perc:-0.00
master_sync_last_io_seconds_ago:0
master_link_down_since_seconds:-1
slave_priority:100
slave_read_only:1
replica_announced:1
connected_slaves:0
master_failover_state:no-failover
master_replid:b4b77bc2319ace4bd216d05397828e6c9a510cde
master_replid2:0000000000000000000000000000000000000000
master_repl_offset:0
second_repl_offset:-1
repl_backlog_active:0
repl_backlog_size:1000000
repl_backlog_first_byte_offset:0
repl_backlog_histlen:0
# CPU
used_cpu_sys:11.910977
used_cpu_user:11.529626
used_cpu_sys_children:0.000000
used_cpu_user_children:0.000000
server_threads:2
long_lock_waits:0
used_cpu_sys_main_thread:8.090785
used_cpu_user_main_thread:7.544344
# Modules
# Errorstats
# Cluster
cluster_enabled:1
# Keyspace
# KeyDB
mvcc_depth:0
```
INFO output from the primary pod:

```
skolosov@C02F34L1MD6R ~ % cat ~/rediscluster-update-test-88-4cgf6.info
# Server
redis_version:255.255.255
redis_git_sha1:9251f1c2
redis_git_dirty:1
redis_build_id:b2dd10e11e590621
redis_mode:cluster
os:Linux 5.15.107+ x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:9.4.0
process_id:13
process_supervised:no
run_id:768c779b42761eadd13ee4750ea4567edf89a042
tcp_port:6379
server_time_usec:1695161910951564
uptime_in_seconds:3695
uptime_in_days:0
hz:10
configured_hz:10
lru_clock:663094
executable:/data/keydb-server
config_file:/redis-conf/redis.conf
availability_zone:us-central1-b
features:cluster_mget
# Clients
connected_clients:1
cluster_connections:26
maxclients:1000
client_recent_max_input_buffer:32
client_recent_max_output_buffer:0
blocked_clients:0
tracking_clients:0
clients_in_timeout_table:0
current_client_thread:1
thread_0_clients:1
thread_1_clients:2
# Memory
used_memory:2635976
used_memory_human:2.51M
used_memory_rss:16084992
used_memory_rss_human:15.34M
used_memory_peak:2779856
used_memory_peak_human:2.65M
used_memory_peak_perc:94.82%
used_memory_overhead:2326856
used_memory_startup:1237256
used_memory_dataset:309120
used_memory_dataset_perc:22.10%
allocator_allocated:3574288
allocator_active:4464640
allocator_resident:7233536
total_system_memory:8341086208
total_system_memory_human:7.77G
used_memory_lua:37888
used_memory_lua_human:37.00K
used_memory_scripts:0
used_memory_scripts_human:0B
number_of_cached_scripts:0
maxmemory:20000000
maxmemory_human:19.07M
maxmemory_policy:noeviction
allocator_frag_ratio:1.25
allocator_frag_bytes:890352
allocator_rss_ratio:1.62
allocator_rss_bytes:2768896
rss_overhead_ratio:2.22
rss_overhead_bytes:8851456
mem_fragmentation_ratio:6.25
mem_fragmentation_bytes:13512848
mem_not_counted_for_evict:0
mem_replication_backlog:1048576
mem_clients_slaves:41024
mem_clients_normal:0
mem_aof_buffer:0
mem_allocator:jemalloc-5.2.1
active_defrag_running:0
lazyfree_pending_objects:0
lazyfreed_objects:0
storage_provider:none
available_system_memory:unavailable
# Persistence
loading:0
current_cow_size:0
current_cow_size_age:0
current_fork_perc:0.00
current_save_keys_processed:0
current_save_keys_total:0
rdb_changes_since_last_save:0
rdb_bgsave_in_progress:1
rdb_last_save_time:1695158215
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:-1
rdb_current_bgsave_time_sec:3650
rdb_last_cow_size:0
aof_enabled:0
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:-1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:0
module_fork_in_progress:0
module_fork_last_cow_size:0
# Stats
total_connections_received:2430
total_commands_processed:9761
instantaneous_ops_per_sec:0
total_net_input_bytes:865774
total_net_output_bytes:3124035
instantaneous_input_kbps:0.02
instantaneous_output_kbps:0.92
rejected_connections:0
sync_full:2
sync_partial_ok:0
sync_partial_err:2
expired_keys:0
expired_stale_perc:0.00
expired_time_cap_reached_count:0
expire_cycle_cpu_milliseconds:48
evicted_keys:0
keyspace_hits:0
keyspace_misses:0
pubsub_channels:0
pubsub_patterns:0
latest_fork_usec:17424
total_forks:1
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
tracking_total_keys:0
tracking_total_items:0
tracking_total_prefixes:0
unexpected_error_replies:0
total_error_replies:0
dump_payload_sanitizations:0
total_reads_processed:12187
total_writes_processed:9757
instantaneous_lock_contention:1
avg_lock_contention:0.484375
storage_provider_read_hits:0
storage_provider_read_misses:0
# Replication
role:master
connected_slaves:2
slave0:ip=10.136.129.145,port=6379,state=wait_bgsave,offset=0,lag=0
slave1:ip=10.136.243.146,port=6379,state=wait_bgsave,offset=0,lag=0
master_failover_state:no-failover
master_replid:6259fa4c9af68a358fd7d4f20804f33b84d847e8
master_replid2:0000000000000000000000000000000000000000
master_repl_offset:5105
second_repl_offset:-1
repl_backlog_active:1
repl_backlog_size:1000000
repl_backlog_first_byte_offset:1
repl_backlog_histlen:5105
# CPU
used_cpu_sys:14.675105
used_cpu_user:15.011243
used_cpu_sys_children:0.000000
used_cpu_user_children:0.000000
server_threads:2
long_lock_waits:0
used_cpu_sys_main_thread:1.864041
used_cpu_user_main_thread:2.880313
# Modules
# Errorstats
# Cluster
cluster_enabled:1
# Keyspace
# KeyDB
mvcc_depth:0
```
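Note the primary's persistence section above: `rdb_bgsave_in_progress:1` with `rdb_current_bgsave_time_sec:3650` and `total_forks:1`, i.e. the fork started for the SYNC at 21:17:40 has been "in progress" for over an hour on a nearly empty dataset, while both replicas sit in `wait_bgsave`. A quick way to flag this stuck state across the whole cluster (a sketch; assumes the pod name prefix used above and `keydb-cli` inside the `redis-node` container):

```bash
# Report any node whose SYNC-triggered BGSAVE has been "in progress"
# for more than 60 seconds.
for pod in $(kubectl get pods -n test -o name | grep rediscluster-update-test-88); do
  pod=${pod#pod/}
  secs=$(kubectl exec -n test "$pod" -c redis-node -- \
    keydb-cli info persistence | tr -d '\r' | \
    awk -F: '/^rdb_current_bgsave_time_sec/ {print $2}')
  [ "${secs:--1}" -gt 60 ] && echo "$pod: BGSAVE stuck for ${secs}s"
done
```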