From c894fc0ee74100298f5a2bf5995046d394fb203e Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 17 Oct 2024 20:55:26 +0800
Subject: [PATCH] Print logs when the cluster state changes to fail or the fail
 reason changes

This log makes it easy to tell whether the cluster failed because full
slot coverage was lost or because this node is in a minority partition.
Sometimes it is not easy to notice the minority partition from within a
healthy shard (both primary and replicas).

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster.h        |  5 +++++
 src/cluster_legacy.c | 27 ++++++++++++++++++++++++++-
 src/cluster_legacy.h |  1 +
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/cluster.h b/src/cluster.h
index 2e4f33a3c9..a09ce19493 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -12,6 +12,11 @@
 #define CLUSTER_FAIL 1                                              /* The cluster can't work */
 #define CLUSTER_NAMELEN 40                                          /* sha1 hex length */
 
+/* Reason why the cluster state is CLUSTER_FAIL (stored in clusterState.fail_reason). */
+#define CLUSTER_FAIL_NONE 0               /* No failure reason recorded. */
+#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1  /* Full coverage required, but a slot is unassigned or its node is FAIL. */
+#define CLUSTER_FAIL_MINORITY_PARTITION 2 /* Fewer primaries are reachable than the needed quorum. */
+
 /* Redirection errors returned by getNodeByQuery(). */
 #define CLUSTER_REDIR_NONE 0          /* Node can serve the request. */
 #define CLUSTER_REDIR_CROSS_SLOT 1    /* -CROSSSLOT request. */
diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 14f8a6bd1e..d7c67c0a8b 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1082,6 +1082,7 @@ void clusterInit(void) {
     server.cluster->myself = NULL;
     server.cluster->currentEpoch = 0;
     server.cluster->state = CLUSTER_FAIL;
+    server.cluster->fail_reason = CLUSTER_FAIL_NONE;
     server.cluster->size = 0;
     server.cluster->todo_before_sleep = 0;
     server.cluster->nodes = dictCreate(&clusterNodesDictType);
@@ -5282,6 +5283,21 @@ void clusterCloseAllSlots(void) {
  * Cluster state evaluation function
  * -------------------------------------------------------------------------- */
 
+/* Log at NOTICE level why the cluster is unable to work, then record 'reason'
+ * (a CLUSTER_FAIL_* code) in server.cluster->fail_reason as the last one logged. */
+void clusterLogWhyFail(int reason) {
+    const char *msg = "Unknown reason code."; /* Used for any unrecognized code. */
+    if (reason == CLUSTER_FAIL_NOT_FULL_COVERAGE) {
+        msg = "Detect there is at least a hash slot uncovered (no available node is serving it). "
+              "Please check the 'cluster-require-full-coverage' configuration option.";
+    } else if (reason == CLUSTER_FAIL_MINORITY_PARTITION) {
+        msg = "In a minority partition.";
+    }
+    serverLog(LL_NOTICE, "Currently cluster unable to work: %s", msg);
+    /* Remember what was logged so a later change of reason can be detected. */
+    server.cluster->fail_reason = reason;
+}
+
 /* The following are defines that are only used in the evaluation function
  * and are based on heuristics. Actually the main point about the rejoin and
  * writable delay is that they should be a few orders of magnitude larger
@@ -5291,7 +5307,7 @@ void clusterCloseAllSlots(void) {
 #define CLUSTER_WRITABLE_DELAY 2000
 
 void clusterUpdateState(void) {
-    int j, new_state;
+    int j, new_state, new_reason;
     int reachable_primaries = 0;
     static mstime_t among_minority_time;
     static mstime_t first_call_time = 0;
@@ -5312,12 +5328,14 @@ void clusterUpdateState(void) {
     /* Start assuming the state is OK. We'll turn it into FAIL if there
      * are the right conditions. */
     new_state = CLUSTER_OK;
+    new_reason = CLUSTER_FAIL_NONE;
 
     /* Check if all the slots are covered. */
     if (server.cluster_require_full_coverage) {
         for (j = 0; j < CLUSTER_SLOTS; j++) {
             if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) {
                 new_state = CLUSTER_FAIL;
+                new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE;
                 break;
             }
         }
@@ -5352,6 +5370,7 @@ void clusterUpdateState(void) {
 
         if (reachable_primaries < needed_quorum) {
             new_state = CLUSTER_FAIL;
+            new_reason = CLUSTER_FAIL_MINORITY_PARTITION;
             among_minority_time = mstime();
         }
     }
@@ -5375,7 +5394,13 @@ void clusterUpdateState(void) {
         serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s",
                   new_state == CLUSTER_OK ? "ok" : "fail");
         server.cluster->state = new_state;
+
+        /* Cluster state changes from ok to fail, print a log. */
+        if (new_state == CLUSTER_FAIL) clusterLogWhyFail(new_reason);
     }
+
+    /* Cluster state is still fail, but the reason has changed, print a log. */
+    if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) clusterLogWhyFail(new_reason);
 }
 
 /* This function is called after the node startup in order to verify that data
diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index 5280644e6e..8bfa8205ae 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -368,6 +368,7 @@ struct clusterState {
     clusterNode *myself; /* This node */
     uint64_t currentEpoch;
     int state;              /* CLUSTER_OK, CLUSTER_FAIL, ... */
+    int fail_reason;        /* Why the cluster state changes to fail. */
     int size;               /* Num of primary nodes with at least one slot */
     dict *nodes;            /* Hash table of name -> clusterNode structures */
     dict *shards;           /* Hash table of shard_id -> list (of nodes) structures */