Use membership instead of cache in disk_failure issues
yngvar-antonsson committed Aug 15, 2024
1 parent 2b4f5ed commit a1e1805
Showing 3 changed files with 55 additions and 67 deletions.
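
In short, the instance no longer inserts a cached ``disk_failure`` issue that sticks around until restart; it publishes its disk state through the SWIM membership payload, and the cluster-wide check reads that payload on every run. Below is a minimal sketch of the new flow, condensed from the diff that follows (``report_disk_state`` and ``collect_disk_failures`` are illustrative names, not functions from the module; ``membership.set_payload``, ``membership.pairs`` and ``fio.lstat`` are the calls actually used in the patch):

    local fio = require('fio')
    local membership = require('membership')

    -- Instance side (list_on_instance): publish whether memtx_dir is still
    -- accessible instead of caching a critical issue locally.
    local function report_disk_state()
        if type(box.cfg) == 'table' then
            membership.set_payload('disk_failure', not fio.lstat(box.cfg.memtx_dir))
        end
    end

    -- Cluster side (list_on_cluster): rebuild the issue list from the gossip
    -- payload on every call, so no per-router cache is needed.
    local function collect_disk_failures(ret, uuids_to_disable)
        for uri, member in membership.pairs() do
            if member.payload.disk_failure then
                table.insert(ret, {
                    level = 'critical',
                    topic = 'disk_failure',
                    instance_uuid = member.payload.uuid,
                    message = ('Disk error on instance %s'):format(uri),
                })
                table.insert(uuids_to_disable, member.payload.uuid)
            end
        end
    end

Because the flag travels with the membership payload rather than a router-local cache, a router restart is no longer needed just to drop a stale issue once the instance reports a healthy disk again.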
2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -37,6 +37,8 @@ Changed

- UUIDs in issues are replaced with instance names and URIs.

- Use ``membership`` instead of a cache in ``disk_failure`` issues.

-------------------------------------------------------------------------------
[2.12.2] - 2024-06-24
-------------------------------------------------------------------------------
45 changes: 18 additions & 27 deletions cartridge/issues.lua
@@ -530,17 +530,8 @@ local function list_on_instance(opts)
end
end

if type(box.cfg) == 'table' and not fio.lstat(box.cfg.memtx_dir) then
table.insert(ret, {
level = 'critical',
topic = 'disk_failure',
instance_uuid = instance_uuid,
replicaset_uuid = replicaset_uuid,
message = string.format(
'Disk error on instance %s. This issue stays until restart',
describe(self_uri)
),
})
if type(box.cfg) == 'table' then
membership.set_payload('disk_failure', not fio.lstat(box.cfg.memtx_dir))
end

-- add custom issues from each role
@@ -564,7 +555,6 @@
return ret
end

local disk_failure_cache = {}
local function list_on_cluster()
local state, err = confapplier.get_state()
if state == 'Unconfigured' and lua_api_proxy.can_call() then
@@ -699,8 +689,8 @@ local function list_on_cluster()
end
end

-- Check aliens in membership and unrecoverable instances
local unrecoverable_uuids = {}
-- Check aliens in membership, unrecoverable instances and disk_failures
local uuids_to_disable = {}
for uri, member in membership.pairs() do
local uuid = member.payload.uuid
if member.status == 'alive'
Expand Down Expand Up @@ -731,7 +721,7 @@ local function list_on_cluster()

::uuid_found::
if uuid ~= nil then -- still no uuid, skipping
table.insert(unrecoverable_uuids, uuid)
table.insert(uuids_to_disable, uuid)
table.insert(ret, {
level = 'warning',
topic = 'autodisable',
@@ -744,6 +734,19 @@ local function list_on_cluster()
})
end
end

if member.payload.disk_failure then
table.insert(ret, {
level = 'critical',
topic = 'disk_failure',
instance_uuid = uuid,
message = string.format(
'Disk error on instance %s',
describe(uri)
),
})
table.insert(uuids_to_disable, uuid)
end
end

-- Get each instance issues (replication, failover, memory usage)
@@ -760,24 +763,12 @@
{uri_list = uri_list, timeout = 1}
)

local uuids_to_disable = {}
for _, issues in pairs(issues_map) do
for _, issue in pairs(issues) do
table.insert(ret, issue)
if issue.topic == 'disk_failure' then
table.insert(uuids_to_disable, issue.instance_uuid)
disk_failure_cache[issue.instance_uuid] = issue
end
end
end

for _, issue in pairs(disk_failure_cache) do
table.insert(ret, issue)
end

if vars.disable_unrecoverable then
uuids_to_disable = fun.chain(uuids_to_disable, unrecoverable_uuids):totable()
end
if #uuids_to_disable > 0 then
lua_api_topology.disable_servers(uuids_to_disable)
end
75 changes: 35 additions & 40 deletions test/integration/disk_failure_test.lua
@@ -65,12 +65,10 @@ function g.test_disk_failure_disable()
local expected_issues = {
{
level = 'critical',
replicaset_uuid = sharded_storage_1.replicaset_uuid,
instance_uuid = sharded_storage_1.instance_uuid,
topic = 'disk_failure',
}, {
level = 'critical',
replicaset_uuid = simple_storage_1.replicaset_uuid,
instance_uuid = simple_storage_1.instance_uuid,
topic = 'disk_failure',
}
@@ -79,41 +77,44 @@
t.assert_covers(issues, expected_issues)
end)

local resp = router:graphql({
query = [[
{
servers {
uri
disabled
t.helpers.retrying({}, function()
local resp = router:graphql({
query = [[
{
servers {
uri
disabled
}
}
}
]]
})
]]
})

table.sort(resp['data']['servers'], function(a, b) return a.uri < b.uri end)
table.sort(resp['data']['servers'], function(a, b) return a.uri < b.uri end)

t.assert_items_equals(resp['data']['servers'], {
{
uri = 'localhost:13301',
disabled = false,
},
{
uri = 'localhost:13302',
disabled = true,
},
{
uri = 'localhost:13303',
disabled = false,
},
{
uri = 'localhost:13304',
disabled = true,
},
{
uri = 'localhost:13305',
disabled = false,
},
})
end)

t.assert_items_equals(resp['data']['servers'], {
{
uri = 'localhost:13301',
disabled = false,
},
{
uri = 'localhost:13302',
disabled = true,
},
{
uri = 'localhost:13303',
disabled = false,
},
{
uri = 'localhost:13304',
disabled = true,
},
{
uri = 'localhost:13305',
disabled = false,
},
})
-- first storage is disabled
t.assert_not(sharded_storage_1:exec(function()
return _G.vshard.storage.internal.is_enabled
@@ -138,12 +139,6 @@ function g.test_disk_failure_disable()
}
]]):format(sharded_storage_1.instance_uuid, simple_storage_1.instance_uuid)})

-- restart router to remove issues
router:restart()
t.helpers.retrying({}, function()
t.assert_equals(helpers.list_cluster_issues(router), {})
end)

-- vshard is enabled again
t.assert(sharded_storage_1:exec(function()
return _G.vshard.storage.internal.is_enabled
