
[BUG] Two replicas of one shard never finish syncing.

Open · skolosov-snap opened this issue 2 years ago • 6 comments

Describe the bug

Two replicas of one shard never finish syncing:

# Replication
role:master
connected_slaves:2
slave0:ip=10.136.129.145,port=6379,state=wait_bgsave,offset=0,lag=0
slave1:ip=10.136.243.146,port=6379,state=wait_bgsave,offset=0,lag=0
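
Both replicas sit in wait_bgsave, i.e. waiting for the primary's background save (RDB snapshot) for the initial sync. A rough way to cross-check this from the primary side (the IP is the primary's pod IP from the cluster output below, and the fields are the same ones that show up in the attached INFO dumps):

keydb-cli -h 10.136.244.15 -p 6379 info persistence | grep -E 'rdb_bgsave_in_progress|rdb_current_bgsave_time_sec|rdb_last_bgsave_status'
keydb-cli -h 10.136.244.15 -p 6379 info replication | grep wait_bgsave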

To reproduce

Create a cluster, then start re-attaching replicas and running failovers manually.
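
Done by hand, the re-attach and failover steps amount to roughly the following per replica (host and node ID here are placeholders):

# point a replica at a (possibly new) primary
keydb-cli -h <replica-pod-ip> -p 6379 cluster replicate <primary-node-id>
# then promote it to force a topology change
keydb-cli -h <replica-pod-ip> -p 6379 cluster failover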

Expected behavior

After topology changes, replicas should complete syncing from the primary node.

Additional information

cluster nodes:

skolosov@C02F34L1MD6R caching-infra % kubectl exec -it rediscluster-update-test-88-6tq2m -n test -c redis-node -- keydb-cli cluster nodes
f30037c4afef7a5ba19c810b1b4768bc8946b53d 10.136.241.13:6379@16379 master,fail - 1695158950093 1695158949088 15 connected
b45c4bed6b978e18f0340851c5af69e9cd2650c1 10.136.244.141:6379@16379 slave 3552bbd8ed39dc1322dcfd43827fc2d65f1729db 0 1695161717036 13 connected
3552bbd8ed39dc1322dcfd43827fc2d65f1729db 10.136.240.211:6379@16379 master - 0 1695161717136 13 connected 0-4095
e8fdb8f41cbc4452f5d1fba78fd48c7b0ff1dd3a 10.136.241.76:6379@16379 slave 3552bbd8ed39dc1322dcfd43827fc2d65f1729db 0 1695161717000 13 connected
65da31685e82c575774b7d3345df79c004658bbe 10.136.241.216:6379@16379 slave 483c38ddb492e0c84a3d21a6b66925dbecb58f4c 0 1695161717036 16 connected
a0cadd9d591882282267cd556498a9f1f4368a5b 10.136.244.15:6379@16379 master - 0 1695161717036 12 connected 4096-5461 9558-10923 15020-16383
483c38ddb492e0c84a3d21a6b66925dbecb58f4c 10.136.241.148:6379@16379 master - 0 1695161717036 16 connected 5462-9557
bcb9878aaf5cb6a589b36177102e9f04b6463faa 10.136.129.145:6379@16379 slave a0cadd9d591882282267cd556498a9f1f4368a5b 0 1695161717136 12 connected
1788d52122f37bbb5ccad866dea82be37dee36ea 10.136.243.146:6379@16379 myself,slave a0cadd9d591882282267cd556498a9f1f4368a5b 0 1695161716000 12 connected
9301a11909678d54db209471cfef80dd25b35843 10.136.241.18:6379@16379 master - 0 1695161717136 0 connected
216f289dd37ae8357478b0e3ede4dc28c9b8f8e2 10.136.242.215:6379@16379 slave f0f832c666bf5cc257cba8b0c80c5e0ffa78233f 0 1695161716935 10 connected
f0f832c666bf5cc257cba8b0c80c5e0ffa78233f 10.136.244.79:6379@16379 master - 0 1695161717136 10 connected 10924-15019
42cabc99bb858b19ce43e7b0f9b8661d16318ac1 10.136.245.20:6379@16379 slave 483c38ddb492e0c84a3d21a6b66925dbecb58f4c 0 1695161717000 16 connected
3e7ab941ba58af914e09b2e3bef95efa0adc0909 10.136.129.140:6379@16379 slave f0f832c666bf5cc257cba8b0c80c5e0ffa78233f 0 1695161717036 10 connected

list of K8s pods:

skolosov@C02F34L1MD6R caching-infra % kubectl get pods -n test -o wide
NAME                                READY   STATUS    RESTARTS   AGE   IP               NODE                                                  NOMINATED NODE   READINESS GATES
keydb-operator-64f968bd45-pqpk2     4/4     Running   0          6h    10.136.242.172   gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-tbds   <none>           <none>
rediscluster-update-test-88-2gtxb   1/1     Running   0          72m   10.136.241.216   gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-xzxq   <none>           <none>
rediscluster-update-test-88-4cgf6   1/1     Running   0          55m   10.136.244.15    gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q   <none>           <none>
rediscluster-update-test-88-6tq2m   1/1     Running   0          56m   10.136.243.146   gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw   <none>           <none>
rediscluster-update-test-88-7bnfj   1/1     Running   0          52m   10.136.240.211   gke-aerospike-skolosov--t--nodepool-1-3ff3f802-dq85   <none>           <none>
rediscluster-update-test-88-88mh5   1/1     Running   0          43m   10.136.241.18    gke-aerospike-skolosov--t--nodepool-1-3ff3f802-z7gq   <none>           <none>
rediscluster-update-test-88-hc5sl   1/1     Running   0          72m   10.136.244.79    gke-aerospike-skolosov--t--nodepool-1-4475920e-d92j   <none>           <none>
rediscluster-update-test-88-jx7q2   1/1     Running   0          45m   10.136.242.215   gke-aerospike-skolosov--t--nodepool-1-3ff3f802-6nrs   <none>           <none>
rediscluster-update-test-88-mv7bw   1/1     Running   0          55m   10.136.129.145   gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-z6q4   <none>           <none>
rediscluster-update-test-88-mv7g4   1/1     Running   0          72m   10.136.241.76    gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-sf2v   <none>           <none>
rediscluster-update-test-88-n2vzl   1/1     Running   0          72m   10.136.129.140   gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-z6q4   <none>           <none>
rediscluster-update-test-88-qv68h   1/1     Running   0          50m   10.136.241.148   gke-aerospike-skolosov--t--nodepool-1-4475920e-rnrx   <none>           <none>
rediscluster-update-test-88-tg5rw   1/1     Running   0          47m   10.136.245.20    gke-aerospike-skolosov--t--nodepool-1-3ff3f802-7rqn   <none>           <none>
rediscluster-update-test-88-th92z   1/1     Running   0          72m   10.136.244.141   gke-aerospike-skolosov--t--nodepool-1-4475920e-zr2r   <none>           <none>

Primary pod: rediscluster-update-test-88-4cgf6
Secondary pod: rediscluster-update-test-88-6tq2m

K8s node descriptions, pod logs, and node INFO output for these two pods are attached below.

CPU consumption on the primary node is spiky, between 0.3 and 0.7 cores, rarely going up to 1.0:

root@rediscluster-update-test-88-4cgf6:/data# top
top - 22:41:48 up  2:31,  0 users,  load average: 0.18, 0.16, 0.12
Tasks:   5 total,   1 running,   4 sleeping,   0 stopped,   0 zombie
%Cpu(s):  5.9 us,  2.5 sy,  0.0 ni, 89.6 id,  1.8 wa,  0.0 hi,  0.2 si,  0.0 st
MiB Mem :   7954.7 total,    926.4 free,    709.8 used,   6318.5 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.   6983.2 avail Mem

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
     13 root      20   0  153584  15564  11988 S   0.7   0.2   0:38.75 keydb-server
      1 root      20   0  754960  35712  25244 S   0.0   0.4   0:01.62 node
     28 root      20   0  153584   4596    636 S   0.0   0.1   0:00.02 keydb-server
     57 root      20   0    4248   3464   3020 S   0.0   0.0   0:00.00 bash
     65 root      20   0    6084   3312   2816 R   0.0   0.0   0:00.00 top
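
PID 28 here is presumably the BGSAVE child forked for the SYNC ("Background saving started by pid 28" in the primary log below), yet it has accumulated almost no CPU time. Whether it is writing anything at all could be checked from inside the pod with something like this (a sketch; temp-28.rdb follows the usual temp-<pid>.rdb convention and the data dir is assumed to be /redis-data per the operator log):

cat /proc/28/io                 # read/write byte counters of the save child
ls -l /redis-data/temp-28.rdb   # temp RDB file the child should be writing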

CPU consumption on the replica node is spiky, between 0.3 and 0.7 cores:

root@rediscluster-update-test-88-6tq2m:/data# top
top - 22:45:13 up  2:34,  0 users,  load average: 0.15, 0.18, 0.17
Tasks:   4 total,   1 running,   3 sleeping,   0 stopped,   0 zombie
%Cpu(s):  8.1 us,  2.7 sy,  0.0 ni, 87.4 id,  1.5 wa,  0.0 hi,  0.3 si,  0.0 st
MiB Mem :   7954.7 total,    903.1 free,    733.4 used,   6318.2 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.   6972.0 avail Mem

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
     14 root      20   0  153584  15376  11932 S   0.7   0.2   0:33.01 keydb-server
      1 root      20   0  754704  37824  27516 S   0.0   0.5   0:01.60 node
     39 root      20   0    4248   3376   2924 S   0.0   0.0   0:00.00 bash
     47 root      20   0    6084   3280   2788 R   0.0   0.0   0:00.00 top
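
On the replica side, sync progress (or the lack of it) can be watched through the master_sync_* counters, roughly like this (same fields as in the attached INFO dump, where master_sync_read_bytes is still 0):

keydb-cli -h 10.136.243.146 -p 6379 info replication | grep -E 'master_link_status|master_sync_in_progress|master_sync_read_bytes|master_sync_left_bytes'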

skolosov-snap · Sep 19 '23 22:09

skolosov@C02F34L1MD6R ~ % cat ~/gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q.desc
Name:               gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
Roles:              <none>
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/instance-type=n2d-standard-2
                    beta.kubernetes.io/os=linux
                    cloud.google.com/gke-boot-disk=pd-balanced
                    cloud.google.com/gke-container-runtime=containerd
                    cloud.google.com/gke-cpu-scaling-level=2
                    cloud.google.com/gke-logging-variant=DEFAULT
                    cloud.google.com/gke-max-pods-per-node=32
                    cloud.google.com/gke-nodepool=nodepool-1
                    cloud.google.com/gke-os-distribution=cos
                    cloud.google.com/gke-provisioning=standard
                    cloud.google.com/gke-stack-type=IPV4
                    cloud.google.com/machine-family=n2d
                    failure-domain.beta.kubernetes.io/region=us-central1
                    failure-domain.beta.kubernetes.io/zone=us-central1-b
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
                    kubernetes.io/os=linux
                    node.kubernetes.io/instance-type=n2d-standard-2
                    topology.gke.io/zone=us-central1-b
                    topology.kubernetes.io/region=us-central1
                    topology.kubernetes.io/zone=us-central1-b
Annotations:        container.googleapis.com/instance_id: 6903402144618495690
                    csi.volume.kubernetes.io/nodeid:
                      {"pd.csi.storage.gke.io":"projects/skolosov-aerospike-test/zones/us-central1-b/instances/gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn...
                    node.alpha.kubernetes.io/ttl: 0
                    node.gke.io/last-applied-node-labels:
                      cloud.google.com/gke-boot-disk=pd-balanced,cloud.google.com/gke-container-runtime=containerd,cloud.google.com/gke-cpu-scaling-level=2,clou...
                    node.gke.io/last-applied-node-taints:
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Tue, 19 Sep 2023 13:10:38 -0700
Taints:             <none>
Unschedulable:      false
Lease:
  HolderIdentity:  gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
  AcquireTime:     <unset>
  RenewTime:       Tue, 19 Sep 2023 15:15:51 -0700
Conditions:
  Type                          Status  LastHeartbeatTime                 LastTransitionTime                Reason                          Message
  ----                          ------  -----------------                 ------------------                ------                          -------
  FrequentDockerRestart         False   Tue, 19 Sep 2023 15:10:56 -0700   Tue, 19 Sep 2023 13:10:43 -0700   NoFrequentDockerRestart         docker is functioning properly
  FrequentContainerdRestart     False   Tue, 19 Sep 2023 15:10:56 -0700   Tue, 19 Sep 2023 13:10:43 -0700   NoFrequentContainerdRestart     containerd is functioning properly
  FrequentUnregisterNetDevice   False   Tue, 19 Sep 2023 15:10:56 -0700   Tue, 19 Sep 2023 13:10:43 -0700   NoFrequentUnregisterNetDevice   node is functioning properly
  KernelDeadlock                False   Tue, 19 Sep 2023 15:10:56 -0700   Tue, 19 Sep 2023 13:10:43 -0700   KernelHasNoDeadlock             kernel has no deadlock
  ReadonlyFilesystem            False   Tue, 19 Sep 2023 15:10:56 -0700   Tue, 19 Sep 2023 13:10:43 -0700   FilesystemIsNotReadOnly         Filesystem is not read-only
  CorruptDockerOverlay2         False   Tue, 19 Sep 2023 15:10:56 -0700   Tue, 19 Sep 2023 13:10:43 -0700   NoCorruptDockerOverlay2         docker overlay2 is functioning properly
  FrequentKubeletRestart        False   Tue, 19 Sep 2023 15:10:56 -0700   Tue, 19 Sep 2023 13:10:43 -0700   NoFrequentKubeletRestart        kubelet is functioning properly
  NetworkUnavailable            False   Tue, 19 Sep 2023 13:10:39 -0700   Tue, 19 Sep 2023 13:10:39 -0700   RouteCreated                    NodeController create implicit route
  MemoryPressure                False   Tue, 19 Sep 2023 15:14:38 -0700   Tue, 19 Sep 2023 13:10:37 -0700   KubeletHasSufficientMemory      kubelet has sufficient memory available
  DiskPressure                  False   Tue, 19 Sep 2023 15:14:38 -0700   Tue, 19 Sep 2023 13:10:37 -0700   KubeletHasNoDiskPressure        kubelet has no disk pressure
  PIDPressure                   False   Tue, 19 Sep 2023 15:14:38 -0700   Tue, 19 Sep 2023 13:10:37 -0700   KubeletHasSufficientPID         kubelet has sufficient PID available
  Ready                         True    Tue, 19 Sep 2023 15:14:38 -0700   Tue, 19 Sep 2023 13:10:39 -0700   KubeletReady                    kubelet is posting ready status. AppArmor enabled
Addresses:
  InternalIP:  10.110.45.32
  ExternalIP:  34.16.61.87
  Hostname:    gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
Capacity:
  cpu:                2
  ephemeral-storage:  16266500Ki
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             8145592Ki
  pods:               32
Allocatable:
  cpu:                1930m
  ephemeral-storage:  4253788136
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             6180536Ki
  pods:               32
System Info:
  Machine ID:                 1d49f7a789402030cda0db23e9fef3c8
  System UUID:                1d49f7a7-8940-2030-cda0-db23e9fef3c8
  Boot ID:                    28956ffd-0c5a-4583-a872-c4c6b19ef842
  Kernel Version:             5.15.107+
  OS Image:                   Container-Optimized OS from Google
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  containerd://1.6.18
  Kubelet Version:            v1.25.10-gke.2700
  Kube-Proxy Version:         v1.25.10-gke.2700
PodCIDR:                      10.136.244.0/26
PodCIDRs:                     10.136.244.0/26
ProviderID:                   gce://skolosov-aerospike-test/us-central1-b/gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q
Non-terminated Pods:          (7 in total)
  Namespace                   Name                                                              CPU Requests  CPU Limits  Memory Requests  Memory Limits  AGE
  ---------                   ----                                                              ------------  ----------  ---------------  -------------  ---
  kube-system                 fluentbit-gke-hplz6                                               100m (5%)     0 (0%)      200Mi (3%)       500Mi (8%)     125m
  kube-system                 gke-metrics-agent-945lg                                           6m (0%)       0 (0%)      100Mi (1%)       100Mi (1%)     125m
  kube-system                 kube-proxy-gke-aerospike-skolosov--t--nodepool-1-55bc5dbf-qn4q    100m (5%)     0 (0%)      0 (0%)           0 (0%)         125m
  kube-system                 pdcsi-node-b4bjk                                                  10m (0%)      0 (0%)      20Mi (0%)        100Mi (1%)     125m
  mesh-system                 cadvisor-bjwgj                                                    300m (15%)    900m (46%)  400Mi (6%)       1224Mi (20%)   125m
  mesh-system                 runtime-security-monitoring-xkqtp                                 110m (5%)     600m (31%)  300Mi (4%)       900Mi (14%)    125m
  test                        rediscluster-update-test-88-4cgf6                                 200m (10%)    1 (51%)     50Mi (0%)        50Mi (0%)      59m
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests      Limits
  --------           --------      ------
  cpu                826m (42%)    2500m (129%)
  memory             1070Mi (17%)  2874Mi (47%)
  ephemeral-storage  0 (0%)        0 (0%)
  hugepages-1Gi      0 (0%)        0 (0%)
  hugepages-2Mi      0 (0%)        0 (0%)
Events:              <none>

skolosov-snap · Sep 19 '23 22:09

skolosov@C02F34L1MD6R ~ % cat ~/gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw.desc
Name:               gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
Roles:              <none>
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/instance-type=n2d-standard-2
                    beta.kubernetes.io/os=linux
                    cloud.google.com/gke-boot-disk=pd-balanced
                    cloud.google.com/gke-container-runtime=containerd
                    cloud.google.com/gke-cpu-scaling-level=2
                    cloud.google.com/gke-logging-variant=DEFAULT
                    cloud.google.com/gke-max-pods-per-node=32
                    cloud.google.com/gke-nodepool=nodepool-1
                    cloud.google.com/gke-os-distribution=cos
                    cloud.google.com/gke-provisioning=standard
                    cloud.google.com/gke-stack-type=IPV4
                    cloud.google.com/machine-family=n2d
                    failure-domain.beta.kubernetes.io/region=us-central1
                    failure-domain.beta.kubernetes.io/zone=us-central1-c
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
                    kubernetes.io/os=linux
                    node.kubernetes.io/instance-type=n2d-standard-2
                    topology.gke.io/zone=us-central1-c
                    topology.kubernetes.io/region=us-central1
                    topology.kubernetes.io/zone=us-central1-c
Annotations:        container.googleapis.com/instance_id: 2168972145624334031
                    csi.volume.kubernetes.io/nodeid:
                      {"pd.csi.storage.gke.io":"projects/skolosov-aerospike-test/zones/us-central1-c/instances/gke-aerospike-skolosov--t--nodepool-1-4475920e-ct...
                    node.alpha.kubernetes.io/ttl: 0
                    node.gke.io/last-applied-node-labels:
                      cloud.google.com/gke-boot-disk=pd-balanced,cloud.google.com/gke-container-runtime=containerd,cloud.google.com/gke-cpu-scaling-level=2,clou...
                    node.gke.io/last-applied-node-taints:
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Tue, 19 Sep 2023 13:10:33 -0700
Taints:             <none>
Unschedulable:      false
Lease:
  HolderIdentity:  gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
  AcquireTime:     <unset>
  RenewTime:       Tue, 19 Sep 2023 15:34:37 -0700
Conditions:
  Type                          Status  LastHeartbeatTime                 LastTransitionTime                Reason                          Message
  ----                          ------  -----------------                 ------------------                ------                          -------
  FrequentDockerRestart         False   Tue, 19 Sep 2023 15:30:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   NoFrequentDockerRestart         docker is functioning properly
  FrequentContainerdRestart     False   Tue, 19 Sep 2023 15:30:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   NoFrequentContainerdRestart     containerd is functioning properly
  CorruptDockerOverlay2         False   Tue, 19 Sep 2023 15:30:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   NoCorruptDockerOverlay2         docker overlay2 is functioning properly
  FrequentUnregisterNetDevice   False   Tue, 19 Sep 2023 15:30:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   NoFrequentUnregisterNetDevice   node is functioning properly
  KernelDeadlock                False   Tue, 19 Sep 2023 15:30:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   KernelHasNoDeadlock             kernel has no deadlock
  ReadonlyFilesystem            False   Tue, 19 Sep 2023 15:30:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   FilesystemIsNotReadOnly         Filesystem is not read-only
  FrequentKubeletRestart        False   Tue, 19 Sep 2023 15:30:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   NoFrequentKubeletRestart        kubelet is functioning properly
  NetworkUnavailable            False   Tue, 19 Sep 2023 13:10:34 -0700   Tue, 19 Sep 2023 13:10:34 -0700   RouteCreated                    NodeController create implicit route
  MemoryPressure                False   Tue, 19 Sep 2023 15:29:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   KubeletHasSufficientMemory      kubelet has sufficient memory available
  DiskPressure                  False   Tue, 19 Sep 2023 15:29:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   KubeletHasNoDiskPressure        kubelet has no disk pressure
  PIDPressure                   False   Tue, 19 Sep 2023 15:29:51 -0700   Tue, 19 Sep 2023 13:10:33 -0700   KubeletHasSufficientPID         kubelet has sufficient PID available
  Ready                         True    Tue, 19 Sep 2023 15:29:51 -0700   Tue, 19 Sep 2023 13:10:34 -0700   KubeletReady                    kubelet is posting ready status. AppArmor enabled
Addresses:
  InternalIP:  10.110.45.30
  ExternalIP:  34.30.166.105
  Hostname:    gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
Capacity:
  cpu:                2
  ephemeral-storage:  16266500Ki
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             8145592Ki
  pods:               32
Allocatable:
  cpu:                1930m
  ephemeral-storage:  4253788136
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             6180536Ki
  pods:               32
System Info:
  Machine ID:                 d99af6d80907f802466ca1adb662b647
  System UUID:                d99af6d8-0907-f802-466c-a1adb662b647
  Boot ID:                    6e256fbd-b4b0-483e-92e9-fcc363a39dc9
  Kernel Version:             5.15.107+
  OS Image:                   Container-Optimized OS from Google
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  containerd://1.6.18
  Kubelet Version:            v1.25.10-gke.2700
  Kube-Proxy Version:         v1.25.10-gke.2700
PodCIDR:                      10.136.243.128/26
PodCIDRs:                     10.136.243.128/26
ProviderID:                   gce://skolosov-aerospike-test/us-central1-c/gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw
Non-terminated Pods:          (7 in total)
  Namespace                   Name                                                              CPU Requests  CPU Limits  Memory Requests  Memory Limits  AGE
  ---------                   ----                                                              ------------  ----------  ---------------  -------------  ---
  kube-system                 fluentbit-gke-t5wzw                                               100m (5%)     0 (0%)      200Mi (3%)       500Mi (8%)     144m
  kube-system                 gke-metrics-agent-c7bvf                                           6m (0%)       0 (0%)      100Mi (1%)       100Mi (1%)     144m
  kube-system                 kube-proxy-gke-aerospike-skolosov--t--nodepool-1-4475920e-ctsw    100m (5%)     0 (0%)      0 (0%)           0 (0%)         144m
  kube-system                 pdcsi-node-tq6ft                                                  10m (0%)      0 (0%)      20Mi (0%)        100Mi (1%)     144m
  mesh-system                 cadvisor-d2cst                                                    300m (15%)    900m (46%)  400Mi (6%)       1224Mi (20%)   144m
  mesh-system                 runtime-security-monitoring-7p6hm                                 110m (5%)     600m (31%)  300Mi (4%)       900Mi (14%)    144m
  test                        rediscluster-update-test-88-6tq2m                                 200m (10%)    1 (51%)     50Mi (0%)        50Mi (0%)      79m
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests      Limits
  --------           --------      ------
  cpu                826m (42%)    2500m (129%)
  memory             1070Mi (17%)  2874Mi (47%)
  ephemeral-storage  0 (0%)        0 (0%)
  hugepages-1Gi      0 (0%)        0 (0%)
  hugepages-2Mi      0 (0%)        0 (0%)
Events:              <none>

skolosov-snap · Sep 19 '23 22:09

I0919 21:16:50.434129       1 redisnode.go:101] Delaying server start for 5s...
Program started at: 2023-09-19 21:16:50.43341675 +0000 UTC m=+0.029030365
REDIS_VERSION=6.2.6
W0919 21:16:55.434623       1 node.go:207] Config /redis-conf/redis.conf not found
I0919 21:16:55.434949       1 node.go:242] Clearing '/redis-data' folder...
I0919 21:16:55.435158       1 redisnode.go:555] Trying test connection to 10.136.244.15:6379 till 2023-09-19 21:17:05.43514852 +0000 UTC m=+15.030762115
I0919 21:16:55.435287       1 redisnode.go:512] Listening on http://0.0.0.0:8088
E0919 21:16:55.435630       1 redisnode.go:578] Can't create client to 10.136.244.15:6379: dial tcp 10.136.244.15:6379: connect: connection refused
I0919 21:16:55.435730       1 redisnode.go:563] Retrying ping to 10.136.244.15:6379 in 0.5 seconds
13:13:C 19 Sep 2023 21:16:55.449 # oO0OoO0OoO0Oo KeyDB is starting oO0OoO0OoO0Oo
13:13:C 19 Sep 2023 21:16:55.449 # KeyDB version=255.255.255, bits=64, commit=9251f1c2, modified=1, pid=13, just started
13:13:C 19 Sep 2023 21:16:55.449 # Configuration loaded
13:13:M 19 Sep 2023 21:16:55.450 * monotonic clock: POSIX clock_gettime
13:13:M 19 Sep 2023 21:16:55.450 * No cluster configuration found, I'm a0cadd9d591882282267cd556498a9f1f4368a5b
13:13:M 19 Sep 2023 21:16:55.453 * Running mode=cluster, port=6379.
13:13:M 19 Sep 2023 21:16:55.453 # Server initialized
13:21:M 19 Sep 2023 21:16:55.454 * Thread 0 alive.
13:22:M 19 Sep 2023 21:16:55.455 * Thread 1 alive.
I0919 21:16:55.937823       1 redisnode.go:555] Trying test connection to 10.136.244.15:6379 till 2023-09-19 21:17:05.43514852 +0000 UTC m=+15.030762115
I0919 21:16:55.938481       1 redisnode.go:590] Successfully pinged local server at 10.136.244.15:6379.
I0919 21:16:56.449983       1 redisnode.go:227] REDIS Node addresses:10.136.129.140:6379
I0919 21:16:56.450019       1 redisnode.go:227] REDIS Node addresses:10.136.241.13:6379
I0919 21:16:56.450025       1 redisnode.go:227] REDIS Node addresses:10.136.241.14:6379
I0919 21:16:56.450030       1 redisnode.go:227] REDIS Node addresses:10.136.241.216:6379
I0919 21:16:56.450035       1 redisnode.go:227] REDIS Node addresses:10.136.241.76:6379
I0919 21:16:56.450039       1 redisnode.go:227] REDIS Node addresses:10.136.243.12:6379
I0919 21:16:56.450044       1 redisnode.go:227] REDIS Node addresses:10.136.243.146:6379
I0919 21:16:56.450050       1 redisnode.go:227] REDIS Node addresses:10.136.244.141:6379
I0919 21:16:56.450057       1 redisnode.go:227] REDIS Node addresses:10.136.244.202:6379
I0919 21:16:56.450061       1 redisnode.go:227] REDIS Node addresses:10.136.244.79:6379
I0919 21:16:56.468318       1 redisnode.go:200] Attaching node 10.136.244.15:6379 to the cluster
I0919 21:16:56.468350       1 admin.go:213] Attaching node 10.136.244.15:6379 to the cluster.
I0919 21:16:56.469656       1 admin.go:246] node 10.136.244.15:6379 attached properly
I0919 21:16:56.469715       1 redisnode.go:213] RedisNode: Running properly
I0919 21:16:56.469746       1 redisnode.go:122] Awaiting stop signal
13:21:M 19 Sep 2023 21:16:56.470 # IP address for this node updated to 10.136.244.15
13:21:M 19 Sep 2023 21:16:59.493 # Cluster state changed: ok
I0919 21:17:14.716124       1 redisnode.go:439] Readiness probe ok
I0919 21:17:14.716517       1 redisnode.go:457] Liveness probe ok
13:22:M 19 Sep 2023 21:17:40.900 * Replica 10.136.129.145:6379 asks for synchronization
13:22:M 19 Sep 2023 21:17:40.900 * Partial resynchronization not accepted: Replication ID mismatch (Replica asked for '59198a1f3c7c39efe03d0de2cf64af6b2abdde26', my replication IDs are '60fddad9b8c101d756f06341b2d02ab3d036629f' and '0000000000000000000000000000000000000000')
13:22:M 19 Sep 2023 21:17:40.900 * Replication backlog created, my new replication IDs are '6259fa4c9af68a358fd7d4f20804f33b84d847e8' and '0000000000000000000000000000000000000000'
13:22:M 19 Sep 2023 21:17:40.900 * Starting BGSAVE for SYNC with target: disk
13:22:M 19 Sep 2023 21:17:40.918 * Background saving started by pid 28
13:22:M 19 Sep 2023 21:17:40.918 * Background saving started
13:21:M 19 Sep 2023 21:17:40.921 * Replica 10.136.243.146:6379 asks for synchronization
13:21:M 19 Sep 2023 21:17:40.922 * Partial resynchronization not accepted: Replication ID mismatch (Replica asked for 'b4b77bc2319ace4bd216d05397828e6c9a510cde', my replication IDs are '6259fa4c9af68a358fd7d4f20804f33b84d847e8' and '0000000000000000000000000000000000000000')
13:21:M 19 Sep 2023 21:17:40.922 * Waiting for end of BGSAVE for SYNC
13:21:M 19 Sep 2023 21:20:46.513 # Failover auth granted to 3552bbd8ed39dc1322dcfd43827fc2d65f1729db for epoch 13
13:21:M 19 Sep 2023 21:23:03.417 # Failover auth granted to 483c38ddb492e0c84a3d21a6b66925dbecb58f4c for epoch 14
13:21:M 19 Sep 2023 21:25:11.462 # Failover auth granted to f30037c4afef7a5ba19c810b1b4768bc8946b53d for epoch 15
13:21:M 19 Sep 2023 21:25:29.705 # Failover auth granted to 483c38ddb492e0c84a3d21a6b66925dbecb58f4c for epoch 16
13:21:M 19 Sep 2023 21:29:13.016 * FAIL message received from 65da31685e82c575774b7d3345df79c004658bbe about f30037c4afef7a5ba19c810b1b4768bc8946b53d
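
The primary log shows "Background saving started" at 21:17:40 but no matching "Background saving terminated" line afterwards, which matches the replicas staying in wait_bgsave. A quick way to confirm this from outside the pod (container name as in the kubectl exec above):

kubectl logs -n test rediscluster-update-test-88-4cgf6 -c redis-node | grep -E 'Background saving (started|terminated)'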

skolosov-snap · Sep 19 '23 22:09

skolosov@C02F34L1MD6R ~ % cat ~/rediscluster-update-test-88-6tq2m.logs
Program started at: 2023-09-19 21:15:48.051282077 +0000 UTC m=+0.019104310
REDIS_VERSION=6.2.6
I0919 21:15:48.051777       1 redisnode.go:101] Delaying server start for 5s...
W0919 21:15:53.052298       1 node.go:207] Config /redis-conf/redis.conf not found
I0919 21:15:53.052614       1 node.go:242] Clearing '/redis-data' folder...
I0919 21:15:53.052780       1 redisnode.go:555] Trying test connection to 10.136.243.146:6379 till 2023-09-19 21:16:03.052760049 +0000 UTC m=+15.020582262
I0919 21:15:53.052948       1 redisnode.go:512] Listening on http://0.0.0.0:8088
E0919 21:15:53.053547       1 redisnode.go:578] Can't create client to 10.136.243.146:6379: dial tcp 10.136.243.146:6379: connect: connection refused
I0919 21:15:53.053565       1 redisnode.go:563] Retrying ping to 10.136.243.146:6379 in 0.5 seconds
14:14:C 19 Sep 2023 21:15:53.067 # oO0OoO0OoO0Oo KeyDB is starting oO0OoO0OoO0Oo
14:14:C 19 Sep 2023 21:15:53.067 # KeyDB version=255.255.255, bits=64, commit=9251f1c2, modified=1, pid=14, just started
14:14:C 19 Sep 2023 21:15:53.067 # Configuration loaded
14:14:M 19 Sep 2023 21:15:53.068 * monotonic clock: POSIX clock_gettime
14:14:M 19 Sep 2023 21:15:53.068 * No cluster configuration found, I'm 1788d52122f37bbb5ccad866dea82be37dee36ea
14:14:M 19 Sep 2023 21:15:53.071 * Running mode=cluster, port=6379.
14:14:M 19 Sep 2023 21:15:53.071 # Server initialized
14:23:M 19 Sep 2023 21:15:53.072 * Thread 0 alive.
14:24:M 19 Sep 2023 21:15:53.072 * Thread 1 alive.
I0919 21:15:53.554156       1 redisnode.go:555] Trying test connection to 10.136.243.146:6379 till 2023-09-19 21:16:03.052760049 +0000 UTC m=+15.020582262
I0919 21:15:53.554905       1 redisnode.go:590] Successfully pinged local server at 10.136.243.146:6379.
I0919 21:15:54.065217       1 redisnode.go:227] REDIS Node addresses:10.136.129.140:6379
I0919 21:15:54.065250       1 redisnode.go:227] REDIS Node addresses:10.136.241.13:6379
I0919 21:15:54.065256       1 redisnode.go:227] REDIS Node addresses:10.136.241.14:6379
I0919 21:15:54.065262       1 redisnode.go:227] REDIS Node addresses:10.136.241.216:6379
I0919 21:15:54.065267       1 redisnode.go:227] REDIS Node addresses:10.136.241.76:6379
I0919 21:15:54.065272       1 redisnode.go:227] REDIS Node addresses:10.136.243.12:6379
I0919 21:15:54.065277       1 redisnode.go:227] REDIS Node addresses:10.136.244.141:6379
I0919 21:15:54.065282       1 redisnode.go:227] REDIS Node addresses:10.136.244.202:6379
I0919 21:15:54.065286       1 redisnode.go:227] REDIS Node addresses:10.136.244.79:6379
I0919 21:15:54.078654       1 redisnode.go:200] Attaching node 10.136.243.146:6379 to the cluster
I0919 21:15:54.078683       1 admin.go:213] Attaching node 10.136.243.146:6379 to the cluster.
I0919 21:15:54.080145       1 admin.go:246] node 10.136.243.146:6379 attached properly
I0919 21:15:54.080197       1 redisnode.go:213] RedisNode: Running properly
I0919 21:15:54.080217       1 redisnode.go:122] Awaiting stop signal
14:23:M 19 Sep 2023 21:15:54.096 # IP address for this node updated to 10.136.243.146
14:23:M 19 Sep 2023 21:15:57.102 # Cluster state changed: ok
I0919 21:16:12.367623       1 redisnode.go:457] Liveness probe ok
I0919 21:16:12.368624       1 redisnode.go:439] Readiness probe ok
14:23:S 19 Sep 2023 21:17:40.897 * Before turning into a replica, using my own master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer.
14:23:S 19 Sep 2023 21:17:40.898 * Connecting to MASTER 10.136.244.15:6379
14:23:S 19 Sep 2023 21:17:40.898 * MASTER <-> REPLICA sync started
14:23:S 19 Sep 2023 21:17:40.898 * Non blocking connect for SYNC fired the event.
14:23:S 19 Sep 2023 21:17:40.901 * Master replied to PING, replication can continue...
14:23:S 19 Sep 2023 21:17:40.921 * Trying a partial resynchronization (request b4b77bc2319ace4bd216d05397828e6c9a510cde:1).
14:23:S 19 Sep 2023 21:17:40.922 * Full resync from master: 6259fa4c9af68a358fd7d4f20804f33b84d847e8:0
14:23:S 19 Sep 2023 21:17:40.922 * Discarding previously cached master state.
NOTICE: Detuning locks due to high load per core: 109.23%
NOTICE: Detuning locks due to high load per core: 95.90%
NOTICE: Detuning locks due to high load per core: 101.46%
14:23:S 19 Sep 2023 21:29:13.016 * FAIL message received from 65da31685e82c575774b7d3345df79c004658bbe about f30037c4afef7a5ba19c810b1b4768bc8946b53d

skolosov-snap · Sep 19 '23 22:09

skolosov@C02F34L1MD6R ~ % cat ~/rediscluster-update-test-88-6tq2m.info
# Server
redis_version:255.255.255
redis_git_sha1:9251f1c2
redis_git_dirty:1
redis_build_id:b2dd10e11e590621
redis_mode:cluster
os:Linux 5.15.107+ x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:9.4.0
process_id:14
process_supervised:no
run_id:f4af81473588adb1f1a1ad868ef256ef34fb20d6
tcp_port:6379
server_time_usec:1695161887678526
uptime_in_seconds:3734
uptime_in_days:0
hz:10
configured_hz:10
lru_clock:663071
executable:/data/keydb-server
config_file:/redis-conf/redis.conf
availability_zone:us-central1-c
features:cluster_mget

# Clients
connected_clients:1
cluster_connections:26
maxclients:1000
client_recent_max_input_buffer:0
client_recent_max_output_buffer:0
blocked_clients:0
tracking_clients:0
clients_in_timeout_table:0
current_client_thread:0
thread_0_clients:1
thread_1_clients:0

# Memory
used_memory:1505368
used_memory_human:1.44M
used_memory_rss:15998976
used_memory_rss_human:15.26M
used_memory_peak:1648056
used_memory_peak_human:1.57M
used_memory_peak_perc:91.34%
used_memory_overhead:1237256
used_memory_startup:1237256
used_memory_dataset:268112
used_memory_dataset_perc:100.00%
allocator_allocated:2440600
allocator_active:3211264
allocator_resident:5980160
total_system_memory:8341086208
total_system_memory_human:7.77G
used_memory_lua:37888
used_memory_lua_human:37.00K
used_memory_scripts:0
used_memory_scripts_human:0B
number_of_cached_scripts:0
maxmemory:20000000
maxmemory_human:19.07M
maxmemory_policy:noeviction
allocator_frag_ratio:1.32
allocator_frag_bytes:770664
allocator_rss_ratio:1.86
allocator_rss_bytes:2768896
rss_overhead_ratio:2.68
rss_overhead_bytes:10018816
mem_fragmentation_ratio:11.10
mem_fragmentation_bytes:14557400
mem_not_counted_for_evict:0
mem_replication_backlog:0
mem_clients_slaves:0
mem_clients_normal:0
mem_aof_buffer:0
mem_allocator:jemalloc-5.2.1
active_defrag_running:0
lazyfree_pending_objects:0
lazyfreed_objects:0
storage_provider:none
available_system_memory:unavailable

# Persistence
loading:0
current_cow_size:0
current_cow_size_age:0
current_fork_perc:0.00
current_save_keys_processed:0
current_save_keys_total:0
rdb_changes_since_last_save:0
rdb_bgsave_in_progress:0
rdb_last_save_time:1695158153
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:-1
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:0
aof_enabled:0
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:-1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:0
module_fork_in_progress:0
module_fork_last_cow_size:0

# Stats
total_connections_received:2453
total_commands_processed:1572
instantaneous_ops_per_sec:0
total_net_input_bytes:46154
total_net_output_bytes:3143735
instantaneous_input_kbps:0.02
instantaneous_output_kbps:0.93
rejected_connections:0
sync_full:0
sync_partial_ok:0
sync_partial_err:0
expired_keys:0
expired_stale_perc:0.00
expired_time_cap_reached_count:0
expire_cycle_cpu_milliseconds:1
evicted_keys:0
keyspace_hits:0
keyspace_misses:0
pubsub_channels:0
pubsub_patterns:0
latest_fork_usec:0
total_forks:0
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
tracking_total_keys:0
tracking_total_items:0
tracking_total_prefixes:0
unexpected_error_replies:0
total_error_replies:0
dump_payload_sanitizations:0
total_reads_processed:4025
total_writes_processed:1572
instantaneous_lock_contention:1
avg_lock_contention:0.390625
storage_provider_read_hits:0
storage_provider_read_misses:0

# Replication
role:slave
master_global_link_status:down
connected_masters:0
master_host:10.136.244.15
master_port:6379
master_link_status:down
master_last_io_seconds_ago:-1
master_sync_in_progress:1
slave_read_repl_offset:1
slave_repl_offset:1
master_sync_total_bytes:-1
master_sync_read_bytes:0
master_sync_left_bytes:-1
master_sync_perc:-0.00
master_sync_last_io_seconds_ago:0
master_link_down_since_seconds:-1
slave_priority:100
slave_read_only:1
replica_announced:1
connected_slaves:0
master_failover_state:no-failover
master_replid:b4b77bc2319ace4bd216d05397828e6c9a510cde
master_replid2:0000000000000000000000000000000000000000
master_repl_offset:0
second_repl_offset:-1
repl_backlog_active:0
repl_backlog_size:1000000
repl_backlog_first_byte_offset:0
repl_backlog_histlen:0

# CPU
used_cpu_sys:11.910977
used_cpu_user:11.529626
used_cpu_sys_children:0.000000
used_cpu_user_children:0.000000
server_threads:2
long_lock_waits:0
used_cpu_sys_main_thread:8.090785
used_cpu_user_main_thread:7.544344

# Modules

# Errorstats

# Cluster
cluster_enabled:1

# Keyspace

# KeyDB
mvcc_depth:0

skolosov-snap · Sep 19 '23 22:09

skolosov@C02F34L1MD6R ~ % cat ~/rediscluster-update-test-88-4cgf6.info
# Server
redis_version:255.255.255
redis_git_sha1:9251f1c2
redis_git_dirty:1
redis_build_id:b2dd10e11e590621
redis_mode:cluster
os:Linux 5.15.107+ x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:9.4.0
process_id:13
process_supervised:no
run_id:768c779b42761eadd13ee4750ea4567edf89a042
tcp_port:6379
server_time_usec:1695161910951564
uptime_in_seconds:3695
uptime_in_days:0
hz:10
configured_hz:10
lru_clock:663094
executable:/data/keydb-server
config_file:/redis-conf/redis.conf
availability_zone:us-central1-b
features:cluster_mget

# Clients
connected_clients:1
cluster_connections:26
maxclients:1000
client_recent_max_input_buffer:32
client_recent_max_output_buffer:0
blocked_clients:0
tracking_clients:0
clients_in_timeout_table:0
current_client_thread:1
thread_0_clients:1
thread_1_clients:2

# Memory
used_memory:2635976
used_memory_human:2.51M
used_memory_rss:16084992
used_memory_rss_human:15.34M
used_memory_peak:2779856
used_memory_peak_human:2.65M
used_memory_peak_perc:94.82%
used_memory_overhead:2326856
used_memory_startup:1237256
used_memory_dataset:309120
used_memory_dataset_perc:22.10%
allocator_allocated:3574288
allocator_active:4464640
allocator_resident:7233536
total_system_memory:8341086208
total_system_memory_human:7.77G
used_memory_lua:37888
used_memory_lua_human:37.00K
used_memory_scripts:0
used_memory_scripts_human:0B
number_of_cached_scripts:0
maxmemory:20000000
maxmemory_human:19.07M
maxmemory_policy:noeviction
allocator_frag_ratio:1.25
allocator_frag_bytes:890352
allocator_rss_ratio:1.62
allocator_rss_bytes:2768896
rss_overhead_ratio:2.22
rss_overhead_bytes:8851456
mem_fragmentation_ratio:6.25
mem_fragmentation_bytes:13512848
mem_not_counted_for_evict:0
mem_replication_backlog:1048576
mem_clients_slaves:41024
mem_clients_normal:0
mem_aof_buffer:0
mem_allocator:jemalloc-5.2.1
active_defrag_running:0
lazyfree_pending_objects:0
lazyfreed_objects:0
storage_provider:none
available_system_memory:unavailable

# Persistence
loading:0
current_cow_size:0
current_cow_size_age:0
current_fork_perc:0.00
current_save_keys_processed:0
current_save_keys_total:0
rdb_changes_since_last_save:0
rdb_bgsave_in_progress:1
rdb_last_save_time:1695158215
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:-1
rdb_current_bgsave_time_sec:3650
rdb_last_cow_size:0
aof_enabled:0
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:-1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:0
module_fork_in_progress:0
module_fork_last_cow_size:0

# Stats
total_connections_received:2430
total_commands_processed:9761
instantaneous_ops_per_sec:0
total_net_input_bytes:865774
total_net_output_bytes:3124035
instantaneous_input_kbps:0.02
instantaneous_output_kbps:0.92
rejected_connections:0
sync_full:2
sync_partial_ok:0
sync_partial_err:2
expired_keys:0
expired_stale_perc:0.00
expired_time_cap_reached_count:0
expire_cycle_cpu_milliseconds:48
evicted_keys:0
keyspace_hits:0
keyspace_misses:0
pubsub_channels:0
pubsub_patterns:0
latest_fork_usec:17424
total_forks:1
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
tracking_total_keys:0
tracking_total_items:0
tracking_total_prefixes:0
unexpected_error_replies:0
total_error_replies:0
dump_payload_sanitizations:0
total_reads_processed:12187
total_writes_processed:9757
instantaneous_lock_contention:1
avg_lock_contention:0.484375
storage_provider_read_hits:0
storage_provider_read_misses:0

# Replication
role:master
connected_slaves:2
slave0:ip=10.136.129.145,port=6379,state=wait_bgsave,offset=0,lag=0
slave1:ip=10.136.243.146,port=6379,state=wait_bgsave,offset=0,lag=0
master_failover_state:no-failover
master_replid:6259fa4c9af68a358fd7d4f20804f33b84d847e8
master_replid2:0000000000000000000000000000000000000000
master_repl_offset:5105
second_repl_offset:-1
repl_backlog_active:1
repl_backlog_size:1000000
repl_backlog_first_byte_offset:1
repl_backlog_histlen:5105

# CPU
used_cpu_sys:14.675105
used_cpu_user:15.011243
used_cpu_sys_children:0.000000
used_cpu_user_children:0.000000
server_threads:2
long_lock_waits:0
used_cpu_sys_main_thread:1.864041
used_cpu_user_main_thread:2.880313

# Modules

# Errorstats

# Cluster
cluster_enabled:1

# Keyspace

# KeyDB
mvcc_depth:0

skolosov-snap · Sep 19 '23 22:09