// Patch summary and review notes. Everything before the first "diff --git"
// header is free text and is not part of any hunk.
//
// Purpose: add a Sentinel setting, EvictVDisksStatus, that selects the forced
// PDisk status used for VDisk eviction and allows eviction to be disabled.
//
//   cms.proto    - new enum EEvictVDisksStatus { UNKNOWN, DISABLED, FAULTY }
//                  and optional field EvictVDisksStatus = 15 in the Sentinel
//                  config message.
//   config.h     - TCmsSentinelConfig gains the EvictVDisksStatus member.
//                  LoadEvictVDisksStatus maps UNKNOWN and FAULTY to
//                  EPDiskStatus::FAULTY and DISABLED to Nothing().
//                  SaveEvictVDisksStatus writes DISABLED when the member is
//                  empty and FAULTY when it holds EPDiskStatus::FAULTY.
//   sentinel.cpp - the forced status is set only when the faulty marker is
//                  present and EvictVDisksStatus is defined; otherwise the
//                  forced status is reset.
//   cms.cpp      - CheckEvictVDisks fails with TStatus::ERROR when
//                  EvictVDisksStatus is empty.
//   cms_ut.cpp   - new test DisabledEvictVDisks: expects a FAULTY BSC update
//                  after the request, an ACTIVE update and an ERROR reply once
//                  eviction is disabled, and a FAULTY update after re-enabling.
//
// Reconstruction notes (the patch had been flattened onto three lines):
//   * Line breaks are restored from the +/- markers and the hunk line counts.
//     Indentation and the position of blank context lines are reconstructed,
//     not recovered - verify against the tree before applying.
//   * Angle-bracketed text had been stripped. TMaybeFail<EPDiskStatus> is
//     restored from how the value is used (EPDiskStatus::FAULTY, Nothing(),
//     SetForcedStatus). The added include is restored as
//     <util/generic/maybe.h>, the header that provides TMaybeFail.
//     TActorBootstrapped<TSentinel> in the hunk header is restored by
//     convention. The four pre-existing context includes in config.h are left
//     bare because their targets cannot be determined from this patch.
//
// NOTE(review): SaveEvictVDisksStatus leaves the proto field untouched when
// the member is defined but is not EPDiskStatus::FAULTY - confirm that no
// other status is ever stored there.
diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp
index c2dacd311246..2b459c6925cb 100644
--- a/ydb/core/cms/cms.cpp
+++ b/ydb/core/cms/cms.cpp
@@ -563,6 +563,12 @@ bool TCms::CheckEvictVDisks(const TAction &action, TErrorInfo &error) const {
         return false;
     }
 
+    if (State->Config.SentinelConfig.EvictVDisksStatus.Empty()) {
+        error.Code = TStatus::ERROR;
+        error.Reason = "Evict vdisks is disabled in Sentinel (self heal)";
+        return false;
+    }
+
     switch (action.GetType()) {
         case TAction::RESTART_SERVICES:
         case TAction::SHUTDOWN_HOST:
diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp
index 81c5914ffa59..d9050fd8eb7e 100644
--- a/ydb/core/cms/cms_ut.cpp
+++ b/ydb/core/cms/cms_ut.cpp
@@ -1951,6 +1951,46 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
         env.CheckDonePermission("user", permission2.GetPermissions(0).GetId());
     }
 
+    Y_UNIT_TEST(DisabledEvictVDisks)
+    {
+        auto opts = TTestEnvOpts(8).WithSentinel();
+        TCmsTestEnv env(opts);
+        env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);
+
+        // Make transition faster for tests purposes
+        auto cmsConfig = env.GetCmsConfig();
+        cmsConfig.MutableSentinelConfig()->SetDefaultStateLimit(1);
+        env.SetCmsConfig(cmsConfig);
+
+        // Evict VDisks
+        auto request = env.CheckPermissionRequest(
+            MakePermissionRequest(TRequestOptions("user").WithEvictVDisks(),
+                MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
+            ),
+            TStatus::DISALLOW_TEMP // ok, waiting for move VDisks
+        );
+
+        // Check that FAULTY BSC request is sent
+        env.CheckBSCUpdateRequests({ env.GetNodeId(0) }, NKikimrBlobStorage::FAULTY);
+
+        // Disable VDisks eviction
+        cmsConfig.MutableSentinelConfig()->SetEvictVDisksStatus(NKikimrCms::TCmsConfig::TSentinelConfig::DISABLED);
+        env.SetCmsConfig(cmsConfig);
+
+        // Check that ACTIVE BSC request is sent
+        env.CheckBSCUpdateRequests({ env.GetNodeId(0) }, NKikimrBlobStorage::ACTIVE);
+
+        // Check that CMS returns ERROR when VDisks eviction is disabled
+        env.CheckRequest("user", request.GetRequestId(), false, TStatus::ERROR, 0);
+
+        // Enable VDisks eviction again
+        cmsConfig.MutableSentinelConfig()->SetEvictVDisksStatus(NKikimrCms::TCmsConfig::TSentinelConfig::FAULTY);
+        env.SetCmsConfig(cmsConfig);
+
+        // Check that FAULTY BSC request is sent again
+        env.CheckBSCUpdateRequests({ env.GetNodeId(0) }, NKikimrBlobStorage::FAULTY);
+    }
+
     Y_UNIT_TEST(EmergencyDuringRollingRestart)
     {
         TCmsTestEnv env(8);
diff --git a/ydb/core/cms/config.h b/ydb/core/cms/config.h
index 6eeb5407111a..4f2f6a7259b9 100644
--- a/ydb/core/cms/config.h
+++ b/ydb/core/cms/config.h
@@ -1,12 +1,14 @@
 #pragma once
 
 #include "pdisk_state.h"
+#include "pdisk_status.h"
 
 #include
 
 #include
 #include
 #include
+#include <util/generic/maybe.h>
 
 namespace NKikimr::NCms {
 
@@ -30,6 +32,8 @@ struct TCmsSentinelConfig {
     ui32 RoomRatio;
     ui32 RackRatio;
 
+    TMaybeFail<EPDiskStatus> EvictVDisksStatus;
+
     void Serialize(NKikimrCms::TCmsConfig::TSentinelConfig &config) const {
         config.SetEnable(Enable);
         config.SetDryRun(DryRun);
@@ -45,6 +49,7 @@ struct TCmsSentinelConfig {
         config.SetRackRatio(RackRatio);
 
         SaveStateLimits(config);
+        SaveEvictVDisksStatus(config);
     }
 
     void Deserialize(const NKikimrCms::TCmsConfig::TSentinelConfig &config) {
@@ -63,6 +68,8 @@ struct TCmsSentinelConfig {
 
         auto newStateLimits = LoadStateLimits(config);
         StateLimits.swap(newStateLimits);
+
+        EvictVDisksStatus = LoadEvictVDisksStatus(config);
     }
 
     void SaveStateLimits(NKikimrCms::TCmsConfig::TSentinelConfig &config) const {
@@ -129,6 +136,31 @@ struct TCmsSentinelConfig {
 
         return stateLimits;
     }
+
+    static TMaybeFail<EPDiskStatus> LoadEvictVDisksStatus(const NKikimrCms::TCmsConfig::TSentinelConfig &config) {
+        using EEvictVDisksStatus = NKikimrCms::TCmsConfig::TSentinelConfig;
+        switch (config.GetEvictVDisksStatus()) {
+            case EEvictVDisksStatus::UNKNOWN:
+            case EEvictVDisksStatus::FAULTY:
+                return EPDiskStatus::FAULTY;
+            case EEvictVDisksStatus::DISABLED:
+                return Nothing();
+        }
+        return EPDiskStatus::FAULTY;
+    }
+
+    void SaveEvictVDisksStatus(NKikimrCms::TCmsConfig::TSentinelConfig &config) const {
+        using EEvictVDisksStatus = NKikimrCms::TCmsConfig::TSentinelConfig;
+
+        if (EvictVDisksStatus.Empty()) {
+            config.SetEvictVDisksStatus(EEvictVDisksStatus::DISABLED);
+            return;
+        }
+
+        if (*EvictVDisksStatus == EPDiskStatus::FAULTY) {
+            config.SetEvictVDisksStatus(EEvictVDisksStatus::FAULTY);
+        }
+    }
 };
 
 struct TCmsLogConfig {
diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp
index cd06c2f65003..c99e236d4a16 100644
--- a/ydb/core/cms/sentinel.cpp
+++ b/ydb/core/cms/sentinel.cpp
@@ -895,8 +895,8 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
                 continue;
             }
 
-            if (it->second.HasFaultyMarker()) {
-                info.SetForcedStatus(EPDiskStatus::FAULTY);
+            if (it->second.HasFaultyMarker() && Config.EvictVDisksStatus.Defined()) {
+                info.SetForcedStatus(*Config.EvictVDisksStatus);
             } else {
                 info.ResetForcedStatus();
             }
diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto
index f91f2c0742de..9abd797c9592 100644
--- a/ydb/core/protos/cms.proto
+++ b/ydb/core/protos/cms.proto
@@ -430,6 +430,12 @@ message TCmsConfig {
             optional uint32 Limit = 2;
         }
 
+        enum EEvictVDisksStatus {
+            UNKNOWN = 0;
+            DISABLED = 1;
+            FAULTY = 2;
+        }
+
         optional bool Enable = 1 [default = true];
         // Updater's config
         optional uint64 UpdateConfigInterval = 2 [default = 3600000000];
@@ -449,6 +455,7 @@ message TCmsConfig {
         optional bool DryRun = 13;
 
         repeated TStateLimit StateLimits = 14;
+        optional EEvictVDisksStatus EvictVDisksStatus = 15;
     }
 
     message TLogConfig {