Skip to content

Commit

Permalink
Forbid to autoreturn an unhealthy leader (#2235)
Browse files Browse the repository at this point in the history
  • Loading branch information
yngvar-antonsson authored May 13, 2024
1 parent 6bc232d commit 51a6a12
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Fixed

- Fix false positive warning in migrations UI.

- Leader autoreturn no longer tries to return leadership to an unhealthy leader.

-------------------------------------------------------------------------------
[2.10.0] - 2024-04-10
-------------------------------------------------------------------------------
Expand Down
9 changes: 9 additions & 0 deletions cartridge/failover/leader_autoreturn.lua
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ local function enable(topology_cfg)
if desired_leader_uuid ~= vars.instance_uuid then
log.info("Autoreturn: try to return leader %s in replicaset %s",
desired_leader_uuid, vars.replicaset_uuid)
local ok, _ = topology.member_is_healthy(
topology_cfg.servers[desired_leader_uuid].uri,
desired_leader_uuid
)
if ok == nil then
log.error("Autoreturn: prime leader is unhealthy")
goto continue
end
local client = vars.client
if client == nil
or client.session == nil
Expand Down Expand Up @@ -50,6 +58,7 @@ local function enable(topology_cfg)
end
end
end
::continue::
end
end

Expand Down
3 changes: 2 additions & 1 deletion rst/topics/failover.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ You can also enable ``leader_autoreturn`` to return leadership to the
first leader in ``failover_priority`` list after failover was triggered.
It might be useful when you have active and passive data centers.
The time before failover will try to return the leader is configured by
``autoreturn_delay`` option in a failover configuration.
``autoreturn_delay`` option in a failover configuration. Note that
``leader_autoreturn`` does not return leadership while the prime leader
is unhealthy.

Stateful failover automatically checks if there is a registered cluster
in a state provider. Check is performed on a first stateful failover
Expand Down
39 changes: 35 additions & 4 deletions test/integration/failover_stateful_autoreturn_test.lua
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
local fio = require('fio')
local fiber = require('fiber')
local t = require('luatest')
local helpers = require('test.helper')

Expand Down Expand Up @@ -168,7 +169,7 @@ local function check_fiber(g, server_name, present)
end, {present})
end

add('test_stateful_failover_autoreturn_fiber_present', function(g)
add('test_fiber_present', function(g)
check_fiber(g, 'coordinator', false)
for _, v in ipairs{'leader', 'replica'} do
check_fiber(g, v, true)
Expand All @@ -191,8 +192,8 @@ add('test_stateful_failover_autoreturn', function(g)
end)
end)

for _, server in ipairs{'coordinator', 'leader'} do
add('test_stateful_failover_autoreturn_fails_no_' .. server, function(g)
for _, server in ipairs({'coordinator', 'leader'}) do
add('test_fails_no_' .. server, function(g)
helpers.retrying({}, function()
local ok, err = g.cluster.main_server:eval(q_promote, {{[storage1_uuid] = storage1_2_uuid}})
t.assert(ok, err)
Expand Down Expand Up @@ -234,11 +235,41 @@ local function set_autoreturn(g, leader_autoreturn)
end
end

add('test_stateful_failover_autoreturn_disable_no_fibers', function(g)
add('test_disable_no_fibers', function(g)
    -- Switch autoreturn off, then run check_fiber with present = false
    -- against every server of the cluster.
    set_autoreturn(g, false)

    local server_names = {'coordinator', 'leader', 'replica'}
    for idx = 1, #server_names do
        check_fiber(g, server_names[idx], false)
    end

    -- Switch autoreturn back on once the checks are done.
    set_autoreturn(g, true)
end)

--- Autoreturn must not hand leadership back while the prime leader is
-- reported as unhealthy.
--
-- Steps:
--   1. promote storage1_2 and wait until it is the reported leader;
--   2. patch `membership.get_member` on the 'replica' server so that the
--      member record carries status 'unhealthy';
--   3. wait for the autoreturn fiber and read the leadership;
--   4. revert the patch BEFORE asserting, so that a failed assertion cannot
--      leave the server with a patched `membership` module;
--   5. assert that leadership has stayed on storage1_2.
add('test_failed_no_prime', function(g)
    helpers.retrying({}, function()
        local ok, err = g.cluster.main_server:eval(q_promote, {{[storage1_uuid] = storage1_2_uuid}})
        t.assert(ok, err)
    end)

    helpers.retrying({}, function()
        t.assert_equals(g.cluster.main_server:eval(q_leadership), storage1_2_uuid)
    end)

    local replica = g.cluster:server('replica')

    replica:exec(function(uri)
        local membership = require('membership')
        -- Keep the original function in a global so that the second exec()
        -- call below is able to put it back.
        rawset(_G, '__get_member_prev', membership.get_member)
        -- NOTE(review): the original function is called with the captured
        -- `uri`, not with `advertise_uri`, so every lookup returns the record
        -- of `uri`. Kept as is because the test may rely on it - confirm.
        package.loaded['membership'].get_member = function(advertise_uri)
            local res = _G.__get_member_prev(uri)
            -- `res` is checked for nil so the patched function never raises
            -- on indexing a missing member record.
            if res ~= nil and uri == advertise_uri then
                res.status = 'unhealthy'
            end
            return res
        end
    end, {replica.advertise_uri})

    -- Collect the observation under pcall: whatever happens here, the patch
    -- has to be reverted before the test is allowed to fail.
    local observed, leader = pcall(function()
        fiber.sleep(5) -- enough to wait autoreturn fiber
        return g.cluster.main_server:eval(q_leadership)
    end)

    replica:exec(function()
        local prev = rawget(_G, '__get_member_prev')
        if prev ~= nil then
            package.loaded['membership'].get_member = prev
            -- Drop the helper global so it does not leak into other tests.
            rawset(_G, '__get_member_prev', nil)
        end
    end)

    if not observed then
        -- Re-raise the captured error untouched (level 0 adds no position).
        error(leader, 0)
    end

    t.assert_not_equals(leader, storage1_1_uuid)
    t.assert_equals(leader, storage1_2_uuid)
end)

0 comments on commit 51a6a12

Please sign in to comment.