redis-cluster-operator

CLUSTER FORGET errors

Open bsergean opened this issue 4 years ago • 4 comments

My cluster got into a bad state; it's unclear why.

  • very recent redis-cluster-operator (from 2 weeks ago)
  • OpenShift 3.11, which is built on Kubernetes 1.11
  • redis-5.0.8
  • 3 masters, 3 replicas
{"level":"info","ts":1592421784.707177,"logger":"controller_distributedrediscluster.redis_util","msg":"Forget node done","Request.Namespace":"cobra-bench","Request.Name":"redis","node":"dea890e5848908b58d70b2911ce69e037cf88772"}
{"level":"info","ts":1592421784.707182,"logger":"controller_distributedrediscluster","msg":"[FixUntrustedNodes] found untrust node","Request.Namespace":"cobra-bench","Request.Name":"redis","node":"{Redis ID: 712df7a404a1aa19fd966f3f7d0ccc68aacb48bb, role: None, master: , link: connected, status: [handshake], addr: 172.25.197.16:6379, slots: [], len(migratingSlots): 0, len(importingSlots): 0}"}
{"level":"info","ts":1592421784.7072167,"logger":"controller_distributedrediscluster","msg":"[FixUntrustedNodes] try to forget node","Request.Namespace":"cobra-bench","Request.Name":"redis","nodeId":"712df7a404a1aa19fd966f3f7d0ccc68aacb48bb"}
{"level":"info","ts":1592421785.1034586,"logger":"controller_distributedrediscluster.redis_util","msg":"CLUSTER FORGET","Request.Namespace":"cobra-bench","Request.Name":"redis","id":"712df7a404a1aa19fd966f3f7d0ccc68aacb48bb","from":"172.28.152.78:6379"}
{"level":"info","ts":1592421785.105386,"logger":"controller_distributedrediscluster.redis_util","msg":"CLUSTER FORGET","Request.Namespace":"cobra-bench","Request.Name":"redis","id":"712df7a404a1aa19fd966f3f7d0ccc68aacb48bb","from":"172.29.215.140:6379"}
{"level":"error","ts":1592421785.1057348,"logger":"controller_distributedrediscluster","msg":"Unable to execute FORGET command: unexpected error on node 172.29.215.140:6379","Request.Namespace":"cobra-bench","Request.Name":"redis","error":"ERR Unknown node 712df7a404a1aa19fd966f3f7d0ccc68aacb48bb","stacktrace":"github.com/go-logr/zapr.(*zapLogger).Error\n\t/go/pkg/mod/github.com/go-logr/[email protected]/zapr.go:128\ngithub.com/ucloud/redis-cluster-operator/pkg/redisutil.(*AdminConnections).ValidateResp\n\t/src/pkg/redisutil/connections.go:296\ngithub.com/ucloud/redis-cluster-operator/pkg/redisutil.(*Admin).ForgetNode\n\t/src/pkg/redisutil/admin.go:480\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/heal.(*CheckAndHeal).FixUntrustedNodes\n\t/src/pkg/controller/heal/untrustenodes.go:46\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/manager.(*realHeal).Heal\n\t/src/pkg/controller/manager/healer.go:31\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/distributedrediscluster.(*ReconcileDistributedRedisCluster).Reconcile\n\t/src/pkg/controller/distributedrediscluster/distributedrediscluster_controller.go:248\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).reconcileHandler\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:256\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).processNextWorkItem\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:232\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).worker\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:211\nk8s.io/apimachinery/pkg/util/wait.JitterUntil.func1\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:152\nk8s.io/apimachinery/pkg/util/wait.JitterUntil\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:153\nk8s.io/apimachinery/pkg/util/wait.Until\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:88"}
{"level":"info","ts":1592421785.105893,"logger":"controller_distributedrediscluster.redis_util","msg":"CLUSTER FORGET","Request.Namespace":"cobra-bench","Request.Name":"redis","id":"712df7a404a1aa19fd966f3f7d0ccc68aacb48bb","from":"172.28.5.80:6379"}
{"level":"error","ts":1592421785.1062799,"logger":"controller_distributedrediscluster","msg":"Unable to execute FORGET command: unexpected error on node 172.28.5.80:6379","Request.Namespace":"cobra-bench","Request.Name":"redis","error":"ERR Unknown node 712df7a404a1aa19fd966f3f7d0ccc68aacb48bb","stacktrace":"github.com/go-logr/zapr.(*zapLogger).Error\n\t/go/pkg/mod/github.com/go-logr/[email protected]/zapr.go:128\ngithub.com/ucloud/redis-cluster-operator/pkg/redisutil.(*AdminConnections).ValidateResp\n\t/src/pkg/redisutil/connections.go:296\ngithub.com/ucloud/redis-cluster-operator/pkg/redisutil.(*Admin).ForgetNode\n\t/src/pkg/redisutil/admin.go:480\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/heal.(*CheckAndHeal).FixUntrustedNodes\n\t/src/pkg/controller/heal/untrustenodes.go:46\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/manager.(*realHeal).Heal\n\t/src/pkg/controller/manager/healer.go:31\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/distributedrediscluster.(*ReconcileDistributedRedisCluster).Reconcile\n\t/src/pkg/controller/distributedrediscluster/distributedrediscluster_controller.go:248\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).reconcileHandler\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:256\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).processNextWorkItem\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:232\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).worker\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:211\nk8s.io/apimachinery/pkg/util/wait.JitterUntil.func1\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:152\nk8s.io/apimachinery/pkg/util/wait.JitterUntil\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:153\nk8s.io/apimachinery/pkg/util/wait.Until\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:88"}

There were 0 restarts:

NAME                                      READY   STATUS    RESTARTS   AGE
cobra-6-mbfqx                             1/1     Running   0          18d
drc-redis-0-0                             1/1     Running   0          18d
drc-redis-0-1                             1/1     Running   0          18d
drc-redis-1-0                             1/1     Running   0          18d
drc-redis-1-1                             1/1     Running   0          18d
drc-redis-2-0                             1/1     Running   0          15d
drc-redis-2-1                             1/1     Running   0          15d
redis-cluster-operator-7cc9cd4489-cmd4g   1/1     Running   0          18d

But CLUSTER NODES shows 4 live masters, plus the slaves and two failed nodes:

/data # redis-cli -u redis://172.16.101.247 -c CLUSTER NODES | awk '{print $2, $3}'
172.25.197.16:6379@16379 slave,fail
172.29.215.140:6379@16379 slave
172.28.5.80:6379@16379 slave
172.31.121.77:6379@16379 master
172.30.225.140:6379@16379 master
172.28.152.78:6379@16379 master
172.30.160.203:6379@16379 master,fail
172.30.112.12:6379@16379 myself,master
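
Since every node keeps its own view of the membership, it may help to diff CLUSTER NODES across all pods; a sketch, assuming the pod names from the listing above:

for pod in drc-redis-0-0 drc-redis-0-1 drc-redis-1-0 drc-redis-1-1 drc-redis-2-0 drc-redis-2-1; do
  echo "== $pod"
  kubectl -n cobra-bench exec "$pod" -- redis-cli CLUSTER NODES | awk '{print $1, $2, $3}' | sort
done

If the views disagree (say, only some peers still list 172.25.197.16 or 172.30.160.203), that would also explain why FORGET succeeds on one node and returns "ERR Unknown node" on the next.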

Any ideas or guesses on how to investigate this?

bsergean commented on Jun 17 '20

I'm looking at the logs of the operator, and I see TONS of such errors.

{"level":"info","ts":1590868431.809033,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1590868492.4941754,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
W0530 19:55:01.976103       1 reflector.go:299] pkg/mod/k8s.io/[email protected]/tools/cache/reflector.go:96: watch of *v1.Job ended with: very short watch: pkg/mod/k8s.io/[email protected]/tools/cache/reflector.go:96: Unexpected watch close - watch lasted less than a second and no items received
W0530 19:55:39.122065       1 reflector.go:299] pkg/mod/k8s.io/[email protected]/tools/cache/reflector.go:96: watch of *v1alpha1.RedisClusterBackup ended with: very short watch: pkg/mod/k8s.io/[email protected]/tools/cache/reflector.go:96: Unexpected watch close - watch lasted less than a second and no items received
{"level":"info","ts":1590868552.9949102,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1590868613.3899803,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
W0530 19:57:45.137573       1 reflector.go:299] pkg/mod/k8s.io/[email protected]/tools/cache/reflector.go:96: watch of *unstructured.Unstructured ended with: very short watch: pkg/mod/k8s.io/[email protected]/tools/cache/reflector.go:96: Unexpected watch close - watch lasted less than a second and no items received

bsergean commented on Jun 17 '20

Before the FORGET errors started, this error showed up:

{"level":"info","ts":1591115256.7091894,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115317.2034957,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115377.7051213,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115438.3920841,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115438.4025192,"logger":"controller_distributedrediscluster","msg":"waitPodReady","Request.Namespace":"cobra-bench","Request.Name":"redis","err":"CheckRedisNodeNum: redis pods are not all ready"}
{"level":"info","ts":1591115448.421646,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115448.4320798,"logger":"controller_distributedrediscluster","msg":"waitPodReady","Request.Namespace":"cobra-bench","Request.Name":"redis","err":"CheckRedisNodeNum: redis pods are not all ready"}
{"level":"info","ts":1591115458.4324322,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115458.443078,"logger":"controller_distributedrediscluster","msg":"waitPodReady","Request.Namespace":"cobra-bench","Request.Name":"redis","err":"CheckRedisNodeNum: redis pods are not all ready"}
{"level":"info","ts":1591115468.4434261,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115468.4537761,"logger":"controller_distributedrediscluster","msg":"waitPodReady","Request.Namespace":"cobra-bench","Request.Name":"redis","err":"CheckRedisNodeNum: redis pods are not all ready"}
{"level":"info","ts":1591115478.4540656,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"cobra-bench","Request.Name":"redis"}
{"level":"info","ts":1591115478.4647434,"logger":"controller_distributedrediscluster","msg":"waitPodReady","Request.Namespace":"cobra-bench","Request.Name":"redis","err":"CheckRedisNodeNum: redis pods are not all ready"}

Then one message about "Forgetting failed node, this command might fail, this is not an error":

{"level":"info","ts":1591115690.2306173,"logger":"controller_distributedrediscluster","msg":"[FixFailedNodes] Forgetting failed node, this command might fail, this is not an error","Request.Namespace":"cobra-bench","Request.Name":"redis","node":"6cb0b32912a84fd7c343e53f7523ac91b8a5f0c9"}

Then the usual:

{"level":"info","ts":1591115690.2306411,"logger":"controller_distributedrediscluster","msg":"[FixFailedNodes] try to forget node","Request.Namespace":"cobra-bench","Request.Name":"redis","nodeId":"6cb0b32912a84fd7c343e53f7523ac91b8a5f0c9"}
{"level":"info","ts":1591115690.4067843,"logger":"controller_distributedrediscluster.redis_util","msg":"CLUSTER FORGET","Request.Namespace":"cobra-bench","Request.Name":"redis","id":"6cb0b32912a84fd7c343e53f7523ac91b8a5f0c9","from":"172.28.5.80:6379"}
{"level":"info","ts":1591115690.4896011,"logger":"controller_distributedrediscluster.redis_util","msg":"CLUSTER FORGET","Request.Namespace":"cobra-bench","Request.Name":"redis","id":"6cb0b32912a84fd7c343e53f7523ac91b8a5f0c9","from":"172.29.197.204:6379"}

Note that the log is enormous now: about 300 MB of that same error message.
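
To keep that manageable I filter the log before reading it; a sketch, assuming jq is available (-R plus fromjson? skips the non-JSON klog warning lines):

kubectl -n cobra-bench logs redis-cluster-operator-7cc9cd4489-cmd4g \
  | jq -r -R 'fromjson? | select(.level == "error") | .msg' \
  | sort | uniq -c | sort -rn | head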

bsergean commented on Jun 18 '20

While testing, I manually rolled all the shard StatefulSets one by one, by adding a nodeSelector that the operator didn't pick up as a change to the CRD, and this issue started happening too.

excieve commented on Jun 30 '20

I hit the same issue. The redis-cluster-operator deployment was successful, and I applied the CR with kubectl apply -f redis.kun_v1alpha1_distributedrediscluster_cr.yaml. All pods are running, but the distributedredisclusters status is Scaling; the reason is "CheckRedisNodeNum: redis pods are not all ready". In addition, the operator logs are as follows:

{"level":"info","ts":1614844714.8602507,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"xie","Request.Name":"rediscluster"}
{"level":"info","ts":1614844714.8717122,"logger":"controller_distributedrediscluster","msg":">>> Sending CLUSTER MEET messages to join the cluster","Request.Namespace":"xie","Request.Name":"rediscluster"}
{"level":"info","ts":1614844714.8730202,"logger":"controller_distributedrediscluster.redis_util","msg":"node 179.20.3.190:6379 attached properly","Request.Namespace":"xie","Request.Name":"rediscluster"}
{"level":"info","ts":1614844715.8760886,"logger":"controller_distributedrediscluster","msg":"requeue","Request.Namespace":"xie","Request.Name":"rediscluster","err":"wait for cluster join: Cluster view is inconsistent"}
{"level":"info","ts":1614844725.8764315,"logger":"controller_distributedrediscluster","msg":"Reconciling DistributedRedisCluster","Request.Namespace":"xie","Request.Name":"rediscluster"}
{"level":"info","ts":1614844725.8848197,"logger":"controller_distributedrediscluster","msg":"[FixFailedNodes] Forgetting failed node, this command might fail, this is not an error","Request.Namespace":"xie","Request.Name":"rediscluster","node":"9a71dc6b66acc16b248f8bba835e0aafbaf9ae09"}
{"level":"info","ts":1614844725.884851,"logger":"controller_distributedrediscluster","msg":"[FixFailedNodes] try to forget node","Request.Namespace":"xie","Request.Name":"rediscluster","nodeId":"9a71dc6b66acc16b248f8bba835e0aafbaf9ae09"}
{"level":"info","ts":1614844725.8873448,"logger":"controller_distributedrediscluster.redis_util","msg":"CLUSTER FORGET","Request.Namespace":"xie","Request.Name":"rediscluster","id":"9a71dc6b66acc16b248f8bba835e0aafbaf9ae09","from":"179.20.3.190:6379"}
{"level":"error","ts":1614844756.9312086,"logger":"controller_distributedrediscluster","msg":"Unable to execute FORGET command: unexpected error on node 179.20.2.106:6379","Request.Namespace":"xie","Request.Name":"rediscluster","error":"ERR Unknown node 1ddf3624a988442ef7c321635920b514f14daae2","stacktrace":"github.com/go-logr/zapr.(*zapLogger).Error\n\t/go/pkg/mod/github.com/go-logr/[email protected]/zapr.go:128\ngithub.com/ucloud/redis-cluster-operator/pkg/redisutil.(*AdminConnections).ValidateResp\n\t/src/pkg/redisutil/connections.go:296\ngithub.com/ucloud/redis-cluster-operator/pkg/redisutil.(*Admin).ForgetNode\n\t/src/pkg/redisutil/admin.go:508\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/heal.(*CheckAndHeal).FixUntrustedNodes\n\t/src/pkg/controller/heal/untrustenodes.go:46\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/manager.(*realHeal).Heal\n\t/src/pkg/controller/manager/healer.go:31\ngithub.com/ucloud/redis-cluster-operator/pkg/controller/distributedrediscluster.(*ReconcileDistributedRedisCluster).Reconcile\n\t/src/pkg/controller/distributedrediscluster/distributedrediscluster_controller.go:248\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).reconcileHandler\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:256\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).processNextWorkItem\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:232\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).worker\n\t/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:211\nk8s.io/apimachinery/pkg/util/wait.JitterUntil.func1\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:152\nk8s.io/apimachinery/pkg/util/wait.JitterUntil\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:153\nk8s.io/apimachinery/pkg/util/wait.Until\n\t/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:88"}

oceanxie commented on Mar 04 '21