From 79b0557b298165808ee77b3f0bfa6f83b65489ca Mon Sep 17 00:00:00 2001 From: Igor Zolotarev Date: Fri, 13 Sep 2024 20:41:16 +0300 Subject: [PATCH] Add doubled buckets issue --- CHANGELOG.rst | 2 + cartridge.lua | 3 + cartridge/issues.lua | 35 +++++++++ cartridge/vshard-utils.lua | 45 ++++++++++++ rst/cartridge_admin.rst | 8 +++ .../vshard_doubled_buckets_test.lua | 72 +++++++++++++++++++ 6 files changed, 165 insertions(+) create mode 100644 test/integration/vshard_doubled_buckets_test.lua diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a32f65d70..6676af237 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -18,6 +18,8 @@ Added - Rocks versions are shown in the WebUI. +- New issue about doubled buckets (can be enabled with TARANTOOL_CHECK_DOUBLED_BUCKETS=true). + ------------------------------------------------------------------------------- [2.12.3] - 2024-08-16 ------------------------------------------------------------------------------- diff --git a/cartridge.lua b/cartridge.lua index 6684bbb5d..1064c38e1 100644 --- a/cartridge.lua +++ b/cartridge.lua @@ -874,6 +874,8 @@ local function cfg(opts, box_opts) local res, err = argparse.get_opts({ disable_unrecoverable_instances = 'boolean', + check_doubled_buckets = 'boolean', + check_doubled_buckets_period = 'number', }) if err ~= nil then @@ -881,6 +883,7 @@ local function cfg(opts, box_opts) end issues.disable_unrecoverable(res.disable_unrecoverable_instances) + issues.check_doubled_buckets(res.check_doubled_buckets, res.check_doubled_buckets_period) if opts.upload_prefix ~= nil then local path = opts.upload_prefix diff --git a/cartridge/issues.lua b/cartridge/issues.lua index 942333f77..8988d5863 100644 --- a/cartridge/issues.lua +++ b/cartridge/issues.lua @@ -50,6 +50,7 @@ -- * various vshard alerts (see vshard docs for details); -- * warning: "Group "..." wasn't bootstrapped: ..."; -- * warning: Vshard storages in replicaset %s marked as "all writable". +-- * warning: "Cluster has ... doubled buckets. 
Call require('cartridge.vshard-utils').find_doubled_buckets() for details";
 -- You can enable extra vshard issues by setting
 -- `TARANTOOL_ADD_VSHARD_STORAGE_ALERTS_TO_ISSUES=true/TARANTOOL_ADD_VSHARD_ROUTER_ALERTS_TO_ISSUES=true`
 -- or with `--add-vshard-storage-alerts-to-issues/--add-vshard-router-alerts-to-issues` command-line argument.
@@ -125,6 +126,7 @@ local lua_api_proxy = require('cartridge.lua-api.proxy')
 local lua_api_topology = require('cartridge.lua-api.topology')
 local invalid_format = require('cartridge.invalid-format')
 local sync_spaces = require('cartridge.sync-spaces')
+local vshard_utils = require('cartridge.vshard-utils')
 
 local ValidateConfigError = errors.new_class('ValidateConfigError')
 
@@ -154,6 +156,9 @@ local limits_ranges = {
 vars:new('limits', default_limits)
 vars:new('disable_unrecoverable', false)
 
+vars:new('check_doubled_buckets', false)
+vars:new('check_doubled_buckets_period', 24*60*60) -- 24 hours
+
 vars:new('instance_uuid')
 vars:new('replicaset_uuid')
 
@@ -565,6 +570,8 @@ local function list_on_instance(opts)
 end
 
 local disk_failure_cache = {}
+local doubled_buckets_count_cache = 0
+local last_doubled_buckets_check = fiber.time() -- so the first scan runs one period after load
 local function list_on_cluster()
     local state, err = confapplier.get_state()
     if state == 'Unconfigured' and lua_api_proxy.can_call() then
@@ -746,6 +753,28 @@ local function list_on_cluster()
         end
     end
 
+    if vars.check_doubled_buckets == true
+    and last_doubled_buckets_check + vars.check_doubled_buckets_period <= fiber.time()
+    then -- the period has elapsed: rescan; otherwise the cached count below is reused
+        local doubled_buckets = vshard_utils.find_doubled_buckets() or {}
+        doubled_buckets_count_cache = 0
+        for _ in pairs(doubled_buckets) do
+            doubled_buckets_count_cache = doubled_buckets_count_cache + 1
+        end
+        last_doubled_buckets_check = fiber.time()
+    end
+
+    if doubled_buckets_count_cache > 0 then
+        table.insert(ret, {
+            level = 'warning',
+            topic = 'vshard',
+            message = string.format(
+                "Cluster has %d doubled buckets. " ..
+                "Call require('cartridge.vshard-utils').find_doubled_buckets() for details",
+                doubled_buckets_count_cache
+            )
+        })
+    end
     -- Get each instance issues (replication, failover, memory usage)
     local twophase_vars = require('cartridge.vars').new('cartridge.twophase')
 
@@ -859,4 +888,10 @@ return {
     disable_unrecoverable = function(disable)
         vars.disable_unrecoverable = disable
     end,
+    check_doubled_buckets = function(check, period)
+        vars.check_doubled_buckets = check
+        if period ~= nil then
+            vars.check_doubled_buckets_period = period
+        end
+    end,
 }
diff --git a/cartridge/vshard-utils.lua b/cartridge/vshard-utils.lua
index 1b7078c74..3b2c152ad 100644
--- a/cartridge/vshard-utils.lua
+++ b/cartridge/vshard-utils.lua
@@ -617,6 +617,50 @@ local function can_bootstrap_group(group_name, vsgroup)
     return true
 end
 
+-- see https://github.com/tarantool/vshard/issues/412 for details
+-- Returns a map bucket_id -> {count, info, uuids} for every bucket reported by
+-- more than one replicaset; `false` if the vshard-router role is absent; `nil, err`.
+local function find_doubled_buckets()
+    if roles.get_role('vshard-router') == nil then
+        return false
+    end
+    local vshard = require('vshard')
+
+    -- Entries are created lazily: preallocating one per configured bucket is
+    -- wasteful on every scan, and indexing a preallocated slot would raise if
+    -- a storage reported an id outside of 1..bucket_count.
+    local seen = {}
+    for _, replicaset in pairs(vshard.router.routeall()) do
+        local buckets, err = replicaset:callro(
+            'vshard.storage.buckets_info', {}, {timeout = 5}
+        )
+        if err then
+            return nil, err
+        end
+
+        for id, bucket in pairs(buckets) do
+            local entry = seen[id]
+            if entry == nil then
+                entry = {count = 0, info = {}, uuids = {}}
+                seen[id] = entry
+            end
+            entry.count = entry.count + 1
+            table.insert(entry.uuids, replicaset.uuid)
+            table.insert(entry.info, bucket)
+        end
+    end
+
+    -- Keep only the buckets that more than one replicaset claims to own.
+    local intersection = {}
+    for id, entry in pairs(seen) do
+        if entry.count > 1 then
+            intersection[id] = entry
+        end
+    end
+
+    return intersection
+end
+
 local function can_bootstrap()
     if roles.get_role('vshard-router') == nil then
         return false
@@ -764,6 +808,7 @@ return {
     can_bootstrap = can_bootstrap,
     edit_vshard_options = edit_vshard_options,
 patch_zone_distances = patch_zone_distances, + find_doubled_buckets = find_doubled_buckets, init = init, } diff --git a/rst/cartridge_admin.rst b/rst/cartridge_admin.rst index 509264fa6..d0fa69022 100644 --- a/rst/cartridge_admin.rst +++ b/rst/cartridge_admin.rst @@ -1512,6 +1512,14 @@ Cartridge displays cluster and instances issues in WebUI: * **warning**: "Vshard storages in replicaset ... marked as "all writable". You can fix it by setting ``all_rw = false`` in the replicaset configuration; + * **warning**: "Cluster has ... doubled buckets. Call require('cartridge.vshard-utils').find_doubled_buckets() for details" + -- you need to call ``require('cartridge.vshard-utils').find_doubled_buckets()`` to get more info + and then remove all duplicated data manually and then use ``vshard.storage.bucket_force_drop(bucket_id)`` + to remove the bucket. See https://github.com/tarantool/vshard/issues/412 for details. + This issue is disabled by default. You can enable it by setting + ``TARANTOOL_CHECK_DOUBLED_BUCKETS=true`` and then the check will run once every + ``TARANTOOL_CHECK_DOUBLED_BUCKETS_PERIOD`` seconds (default is 24 hours); + You can enable extra vshard issues by setting ``TARANTOOL_ADD_VSHARD_STORAGE_ALERTS_TO_ISSUES=true/TARANTOOL_ADD_VSHARD_ROUTER_ALERTS_TO_ISSUES=true`` or with ``--add-vshard-storage-alerts-to-issues/--add-vshard-router-alerts-to-issues`` command-line argument. 
diff --git a/test/integration/vshard_doubled_buckets_test.lua b/test/integration/vshard_doubled_buckets_test.lua
new file mode 100644
index 000000000..06f51bbbd
--- /dev/null
+++ b/test/integration/vshard_doubled_buckets_test.lua
@@ -0,0 +1,72 @@
+local fio = require('fio')
+local t = require('luatest')
+local g = t.group()
+
+local helpers = require('test.helper')
+
+g.before_all = function()
+    g.cluster = helpers.Cluster:new({
+        datadir = fio.tempdir(),
+        server_command = helpers.entrypoint('srv_basic'),
+        cookie = helpers.random_cookie(),
+        use_vshard = true,
+        replicasets = {
+            {
+                alias = 'router',
+                roles = {'vshard-router'},
+                servers = 1,
+            },
+            {
+                alias = 'storage-1',
+                roles = {'vshard-storage'},
+                servers = 1,
+            },
+            {
+                alias = 'storage-2',
+                roles = {'vshard-storage'},
+                servers = 1,
+            },
+        },
+        env = {
+            TARANTOOL_CHECK_DOUBLED_BUCKETS = 'true',
+            TARANTOOL_CHECK_DOUBLED_BUCKETS_PERIOD = '10',
+        },
+    })
+    g.cluster:start()
+end
+
+g.after_all = function()
+    g.cluster:stop()
+    fio.rmtree(g.cluster.datadir)
+end
+
+function g.test_doubled_buckets()
+    local bucket = g.cluster:server('storage-2-1'):exec(function()
+        return box.space._bucket:select(nil, {limit = 1})[1]
+    end)
+
+    g.cluster:server('storage-1-1'):exec(function(tuple)
+        box.space._bucket:run_triggers(false) -- insert the row without firing the space triggers
+        return box.space._bucket:insert(tuple)
+    end, {bucket})
+
+    t.helpers.retrying({timeout = 20}, function()
+        t.assert_covers(helpers.list_cluster_issues(g.cluster.main_server), {
+            {
+                level = 'warning',
+                topic = 'vshard',
+                message = "Cluster has 1 doubled buckets. " ..
+                    "Call require('cartridge.vshard-utils').find_doubled_buckets() for details",
+            },
+        })
+    end)
+
+    g.cluster:server('storage-1-1'):exec(function(tuple)
+        box.space._bucket:delete(tuple[1])
+        box.space._bucket:run_triggers(true) -- restore what the insert step switched off
+    end, {bucket})
+
+    t.helpers.retrying({timeout = 20}, function()
+        t.assert_equals(helpers.list_cluster_issues(g.cluster.main_server), {}) -- assert_covers(x, {}) is vacuous
+    end)
+end