diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 14f8a6bd1e..17e8fdde4c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -654,7 +654,7 @@ int clusterLoadConfig(char *filename) { } else if (!strcasecmp(s, "handshake")) { n->flags |= CLUSTER_NODE_HANDSHAKE; } else if (!strcasecmp(s, "noaddr")) { - n->flags |= CLUSTER_NODE_NOADDR; + n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); } else if (!strcasecmp(s, "nofailover")) { n->flags |= CLUSTER_NODE_NOFAILOVER; } else if (!strcasecmp(s, "noflags")) { @@ -3250,13 +3250,20 @@ int clusterProcessPacket(clusterLink *link) { "having flags %d", link->node->name, link->node->human_nodename, link->node->shard_id, (int)(now - (link->node->ctime)), link->node->flags); - link->node->flags |= CLUSTER_NODE_NOADDR; + /* We also mark the node as fail because we have disconnected from it, + * and will not reconnect, and obviously we will not gossip NOADDR nodes. + * Marking it as FAIL can help us advance the state, such as the cluster + * state becomes FAIL or the replica can do the failover. Otherwise, the + * NOADDR node will provide an invalid address in redirection and confuse + * the client, and the replica will not initiate a failover since the node + * is not actually in FAIL state. */ + link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); link->node->ip[0] = '\0'; link->node->tcp_port = 0; link->node->tls_port = 0; link->node->cport = 0; freeClusterLink(link); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE); return 0; } } diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl new file mode 100644 index 0000000000..5516fc8303 --- /dev/null +++ b/tests/unit/cluster/noaddr.tcl @@ -0,0 +1,40 @@ +start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} { + test "NOADDR nodes will be marked as FAIL" { + set primary0_id [R 0 CLUSTER MYID] + + # R 0 is a primary, doing a CLUSTER RESET and the node name will be modified, + # and other nodes will set it to NOADDR. + R 0 cluster reset hard + wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10 + assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] 1 + + # Also we will set the node to FAIL, so the cluster will eventually be down. + assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] 1 + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} && + [CI 3 cluster_state] eq {fail} && + [CI 4 cluster_state] eq {fail} && + [CI 5 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + + # key_977613 belong to slot 0 and belong to R 0. + # Make sure we get a CLUSTER DOWN instead of an invalid MOVED. + assert_error {CLUSTERDOWN*} {R 1 set key_977613 bar} + + # Let the replica 3 do the failover. + R 3 config set cluster-replica-validity-factor 0 + R 3 config set cluster-replica-no-failover no + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} && + [CI 3 cluster_state] eq {ok} && + [CI 4 cluster_state] eq {ok} && + [CI 5 cluster_state] eq {ok} + } else { + fail "Cluster doesn't ok" + } + } +} ;# start_cluster