diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0918a3b35..6e3f58045 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -36,6 +36,8 @@ Fixed
 
 - Fix false positive warning in migrations UI.
 
+- Leader autoreturn no longer tries to return leadership to an unhealthy leader.
+
 -------------------------------------------------------------------------------
 [2.10.0] - 2024-04-10
 -------------------------------------------------------------------------------
diff --git a/cartridge/failover/leader_autoreturn.lua b/cartridge/failover/leader_autoreturn.lua
index aac2addfb..80e24f986 100644
--- a/cartridge/failover/leader_autoreturn.lua
+++ b/cartridge/failover/leader_autoreturn.lua
@@ -22,6 +22,14 @@ local function enable(topology_cfg)
         if desired_leader_uuid ~= vars.instance_uuid then
             log.info("Autoreturn: try to return leader %s in replicaset %s",
                 desired_leader_uuid, vars.replicaset_uuid)
+            local ok, _ = topology.member_is_healthy(
+                topology_cfg.servers[desired_leader_uuid].uri,
+                desired_leader_uuid
+            )
+            if ok == nil then
+                log.error("Autoreturn: prime leader is unhealthy")
+                goto continue
+            end
             local client = vars.client
             if client == nil
             or client.session == nil
@@ -50,6 +58,7 @@ local function enable(topology_cfg)
                 end
             end
         end
+        ::continue::
     end
 end
 
diff --git a/rst/topics/failover.rst b/rst/topics/failover.rst
index dd1a170f7..5ce1c0286 100644
--- a/rst/topics/failover.rst
+++ b/rst/topics/failover.rst
@@ -162,7 +162,8 @@ You can also enable ``leader_autoreturn`` to return leadership to the first
 leader in ``failover_priority`` list after failover was triggered. It might
 be useful when you have active and passive data centers. The time before
 failover will try to return the leader is configured by
-``autoreturn_delay`` option in a failover configuration.
+``autoreturn_delay`` option in a failover configuration. Note that
+``leader_autoreturn`` won't work if the prime leader is unhealthy.
 
 Stateful failover automatically checks if there is a registered cluster
 in a state provider. Check is performed on a first stateful failover
diff --git a/test/integration/failover_stateful_autoreturn_test.lua b/test/integration/failover_stateful_autoreturn_test.lua
index 03d017d61..6a2e9d7f3 100644
--- a/test/integration/failover_stateful_autoreturn_test.lua
+++ b/test/integration/failover_stateful_autoreturn_test.lua
@@ -1,4 +1,5 @@
 local fio = require('fio')
+local fiber = require('fiber')
 local t = require('luatest')
 
 local helpers = require('test.helper')
@@ -168,7 +169,7 @@ local function check_fiber(g, server_name, present)
     end, {present})
 end
 
-add('test_stateful_failover_autoreturn_fiber_present', function(g)
+add('test_fiber_present', function(g)
     check_fiber(g, 'coordinator', false)
     for _, v in ipairs{'leader', 'replica'} do
         check_fiber(g, v, true)
@@ -191,8 +192,8 @@ add('test_stateful_failover_autoreturn', function(g)
     end)
 end)
 
-for _, server in ipairs{'coordinator', 'leader'} do
-    add('test_stateful_failover_autoreturn_fails_no_' .. server, function(g)
+for _, server in ipairs({'coordinator', 'leader'}) do
+    add('test_fails_no_' .. server, function(g)
         helpers.retrying({}, function()
             local ok, err = g.cluster.main_server:eval(q_promote, {{[storage1_uuid] = storage1_2_uuid}})
             t.assert(ok, err)
@@ -234,7 +235,7 @@ local function set_autoreturn(g, leader_autoreturn)
     end
 end
 
-add('test_stateful_failover_autoreturn_disable_no_fibers', function(g)
+add('test_disable_no_fibers', function(g)
     set_autoreturn(g, false)
     for _, v in ipairs{'coordinator', 'leader', 'replica'} do
         check_fiber(g, v, false)
@@ -242,3 +243,33 @@ add('test_stateful_failover_autoreturn_disable_no_fibers', function(g)
     end
     set_autoreturn(g, true)
 end)
+add('test_failed_no_prime', function(g)
+    helpers.retrying({}, function()
+        local ok, err = g.cluster.main_server:eval(q_promote, {{[storage1_uuid] = storage1_2_uuid}})
+        t.assert(ok, err)
+    end)
+
+    helpers.retrying({}, function()
+        t.assert_equals(g.cluster.main_server:eval(q_leadership), storage1_2_uuid)
+    end)
+
+    g.cluster:server('replica'):exec(function(uri)
+        local membership = require('membership')
+        rawset(_G, '__get_member_prev', membership.get_member)
+        package.loaded['membership'].get_member = function(advertise_uri)
+            local res = _G.__get_member_prev(advertise_uri)
+            if res ~= nil and uri == advertise_uri then
+                res.status = 'unhealthy'
+            end
+            return res
+        end
+    end, {g.cluster:server('leader').advertise_uri})
+    fiber.sleep(5) -- give the autoreturn fiber enough time to fire
+
+    t.assert_not_equals(g.cluster.main_server:eval(q_leadership), storage1_1_uuid)
+    t.assert_equals(g.cluster.main_server:eval(q_leadership), storage1_2_uuid)
+
+    g.cluster:server('replica'):exec(function()
+        package.loaded['membership'].get_member = rawget(_G, '__get_member_prev')
+    end)
+end)
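
For readers following the rst/topics/failover.rst change: leader_autoreturn and autoreturn_delay are set together with the other stateful failover parameters. A minimal sketch of enabling them through the Lua API, assuming cartridge.failover_set_params with a stateboard state provider; the URI, password, and delay below are illustrative placeholders, not recommendations:

    local cartridge = require('cartridge')

    -- Sketch: stateful failover with leader autoreturn enabled.
    local ok, err = cartridge.failover_set_params({
        mode = 'stateful',
        state_provider = 'tarantool',       -- stateboard-based provider
        tarantool_params = {
            uri = 'localhost:4401',         -- placeholder stateboard URI
            password = 'stateboard-secret', -- placeholder password
        },
        leader_autoreturn = true,
        autoreturn_delay = 300,             -- seconds between return attempts
    })
    assert(ok, err)

With this patch applied, each return attempt is additionally gated on the prime leader's membership health, as the documentation note above states.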
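The guard added in cartridge/failover/leader_autoreturn.lua delegates the decision to topology.member_is_healthy. As a rough sketch of what a membership-based health check of this kind looks like (an illustration only; the real helper lives in cartridge's topology module and may differ in detail):

    local membership = require('membership')

    -- Sketch: an instance counts as healthy when membership knows it,
    -- its SWIM status is 'alive' (or merely 'suspect'), and the UUID it
    -- reports matches the instance we expect to find at that URI.
    local function member_is_healthy(uri, instance_uuid)
        local member = membership.get_member(uri)
        return (
            member ~= nil
            and (member.status == 'alive' or member.status == 'suspect')
            and member.payload.uuid == instance_uuid
        ) or nil
    end

This is also why the new integration test can simulate an unhealthy prime leader by patching membership.get_member: any record whose status is not alive-ish makes the check return nil, and the autoreturn fiber skips the attempt.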
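Lua has no continue statement, so the patch uses the conventional goto workaround: a ::continue:: label placed at the end of the loop body (goto is supported by LuaJIT and Lua 5.2+, hence by Tarantool). A standalone illustration of the idiom:

    -- Skip even numbers; `goto continue` jumps to the label at the
    -- bottom of the loop body, emulating `continue`.
    for i = 1, 5 do
        if i % 2 == 0 then
            goto continue
        end
        print(i)
        ::continue::
    end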