Mark the node as FAIL when the node is marked as NOADDR #1191

Open · wants to merge 3 commits into base `unstable`
17 changes: 13 additions & 4 deletions src/cluster_legacy.c
@@ -654,7 +654,7 @@ int clusterLoadConfig(char *filename) {
         } else if (!strcasecmp(s, "handshake")) {
             n->flags |= CLUSTER_NODE_HANDSHAKE;
         } else if (!strcasecmp(s, "noaddr")) {
-            n->flags |= CLUSTER_NODE_NOADDR;
+            n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
         } else if (!strcasecmp(s, "nofailover")) {
             n->flags |= CLUSTER_NODE_NOFAILOVER;
         } else if (!strcasecmp(s, "noflags")) {
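This first hunk affects config loading: nodes.conf flags are parsed token by token and OR-ed onto the node's flag bits, so a node persisted as `noaddr` now comes back at startup carrying FAIL as well. A minimal standalone sketch of that token-to-bits mapping, assuming the flag bit values from cluster.h (the `flagBits` helper is hypothetical; the real code ORs the bits inline, as in the hunk above):

```c
#include <stdio.h>
#include <strings.h> /* strcasecmp */

/* Flag bit values shown for illustration (they mirror cluster.h). */
#define CLUSTER_NODE_FAIL 8
#define CLUSTER_NODE_HANDSHAKE 32
#define CLUSTER_NODE_NOADDR 64
#define CLUSTER_NODE_NOFAILOVER 512

/* Hypothetical helper mirroring the strcasecmp chain in clusterLoadConfig:
 * one flag token from nodes.conf -> the bit(s) it sets. */
static int flagBits(const char *s) {
    if (!strcasecmp(s, "handshake")) return CLUSTER_NODE_HANDSHAKE;
    /* After this patch, "noaddr" implies FAIL as well. */
    if (!strcasecmp(s, "noaddr")) return CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL;
    if (!strcasecmp(s, "nofailover")) return CLUSTER_NODE_NOFAILOVER;
    return 0; /* "noflags" and unknown tokens set nothing */
}

int main(void) {
    printf("noaddr -> 0x%x\n", flagBits("noaddr")); /* NOADDR|FAIL */
    return 0;
}
```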
@@ -3244,19 +3244,28 @@ int clusterProcessPacket(clusterLink *link) {
         } else if (memcmp(link->node->name, hdr->sender, CLUSTER_NAMELEN) != 0) {
             /* If the reply has a non matching node ID we
              * disconnect this node and set it as not having an associated
-             * address. */
+             * address. This can happen if the node did CLUSTER RESET and
+             * changed its node ID; the old node ID will not come back.
+             *
+             * We also mark the node as FAIL: we have disconnected from it,
+             * will not reconnect, and do not gossip NOADDR nodes. Marking
+             * it as FAIL lets the cluster make progress, e.g. the cluster
+             * state can become FAIL or a replica can start a failover.
+             * Otherwise the NOADDR node would supply an invalid address
+             * in redirects and confuse clients, and the replica would
+             * never fail over since the node is not in the FAIL state. */
             serverLog(LL_NOTICE,
                       "PONG contains mismatching sender ID. About node %.40s (%s) in shard %.40s added %d ms ago, "
                       "having flags %d",
                       link->node->name, link->node->human_nodename, link->node->shard_id,
                       (int)(now - (link->node->ctime)), link->node->flags);
-            link->node->flags |= CLUSTER_NODE_NOADDR;
+            link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
             link->node->ip[0] = '\0';
             link->node->tcp_port = 0;
             link->node->tls_port = 0;
             link->node->cport = 0;
             freeClusterLink(link);
-            clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
+            clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
             return 0;
         }
     }
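To see why the extra FAIL bit matters, here is a minimal standalone sketch (not part of the diff) of how the failure check behaves before and after this change. It assumes the flag bit values and the `nodeFailed()` helper from cluster.h; the simplified `clusterNode` struct is illustrative only.

```c
#include <stdio.h>

/* Flag bits as defined in cluster.h (values shown for illustration). */
#define CLUSTER_NODE_FAIL 8    /* The node is believed to be failing */
#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */

/* Simplified stand-in for the real clusterNode struct. */
typedef struct {
    int flags;
} clusterNode;

/* Same shape as the nodeFailed() macro in cluster.h. */
#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL)

int main(void) {
    clusterNode n = {0};

    /* Old behavior: only NOADDR is set, so failure checks never fire,
     * the cluster state stays OK, and no replica starts a failover. */
    n.flags |= CLUSTER_NODE_NOADDR;
    printf("NOADDR only -> failed? %d\n", !!nodeFailed(&n)); /* 0 */

    /* New behavior: NOADDR and FAIL are set together, so the node is
     * treated as failed and the cluster can advance its state. */
    n.flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
    printf("NOADDR+FAIL -> failed? %d\n", !!nodeFailed(&n)); /* 1 */
    return 0;
}
```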
44 changes: 44 additions & 0 deletions tests/unit/cluster/noaddr.tcl
@@ -0,0 +1,44 @@
start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} {
    test "NOADDR nodes will be marked as FAIL" {
        set primary0_id [R 0 CLUSTER MYID]

        # R 0 is a primary; after a CLUSTER RESET HARD its node ID changes,
        # so the other nodes will mark the old node ID as NOADDR.
        R 0 cluster reset hard
        wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10
        wait_for_condition 1000 10 {
            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] eq 1 &&
            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1
        } else {
            fail "The node is not marked with the correct flags"
        }

        # Since the node is also marked as FAIL, the cluster will eventually go down.
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {fail} &&
            [CI 2 cluster_state] eq {fail} &&
            [CI 3 cluster_state] eq {fail} &&
            [CI 4 cluster_state] eq {fail} &&
            [CI 5 cluster_state] eq {fail}
        } else {
            fail "Cluster doesn't fail"
        }

        # key_977613 belongs to slot 0 (see the slot-computation sketch after
        # this file), which is served by R 0. Make sure we get a CLUSTERDOWN
        # error instead of an invalid MOVED.
        assert_error {CLUSTERDOWN*} {R 1 set key_977613 bar}

        # Let replica R 3 perform the failover.
        R 3 config set cluster-replica-validity-factor 0
        R 3 config set cluster-replica-no-failover no
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok} &&
            [CI 4 cluster_state] eq {ok} &&
            [CI 5 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }
    }
} ;# start_cluster
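For readers wondering how the test knows which slot key_977613 lands in: cluster key slots are computed as CRC16(key) mod 16384, using the CRC16-CCITT (XModem) variant. A small C sketch that prints the slot for a key, omitting the {hash tag} handling of the full implementation:

```c
#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* CRC16-CCITT (XModem): poly 0x1021, init 0, no reflection. This is the
 * same variant the cluster uses for key hashing. */
static uint16_t crc16(const char *buf, int len) {
    uint16_t crc = 0;
    for (int i = 0; i < len; i++) {
        crc ^= (uint16_t)((unsigned char)buf[i]) << 8;
        for (int j = 0; j < 8; j++)
            crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x1021)
                                 : (uint16_t)(crc << 1);
    }
    return crc;
}

int main(void) {
    const char *key = "key_977613";
    /* HASH_SLOT = CRC16(key) mod 16384; the test relies on this key
     * mapping to slot 0, which R 0 serves. */
    printf("%s -> slot %u\n", key, crc16(key, (int)strlen(key)) % 16384);
    return 0;
}
```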