From 158dfc1b1760121259c9bde33af57f4419e3625b Mon Sep 17 00:00:00 2001 From: yiming <107105845+nshangyiming@users.noreply.github.com> Date: Mon, 3 Jul 2023 12:05:30 +0800 Subject: [PATCH] [Enhancement] Optimize the root path choosing logic on tablet creation (#26238) Fixes SR-18615 Return a vector containing all the storage root paths, sorted by disk usage in ascending order. The front portion of the vector, excluding paths which have high disk usage, is then shuffled so that newly created tablets are not all placed on one specific path. This is an enhancement to PR #20833. In the original logic, the vector was sorted by available capacity in descending order and then half of the vector was shuffled, which has two problems: 1. Different disks may have different total capacities, so the paths should be sorted by usage rather than by available bytes. 2. If the disk usage of the root paths is similar, this causes half of the disks to have an unbalanced number of tablets. Signed-off-by: Dejun Xia (cherry picked from commit dbf5fc61e8ba8d2d6cf58bc143c290d41e5a7663) # Conflicts: # be/src/common/config.h --- be/src/common/config.h | 22 +++++++++++++++------- be/src/storage/data_dir.cpp | 2 +- be/src/storage/data_dir.h | 5 ++++- be/src/storage/storage_engine.cpp | 26 ++++++++++++++++++++++---- be/src/storage/storage_engine.h | 5 +++-- 5 files changed, 45 insertions(+), 15 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 53a957cf6921a..6b75833651d08 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -529,17 +529,21 @@ CONF_mInt32(max_consumer_num_per_group, "3"); // Max pulsar consumer num in one data consumer group, for routine load. CONF_mInt32(max_pulsar_consumer_num_per_group, "10"); +<<<<<<< HEAD // The size of thread pool for routine load task. // this should be larger than FE config 'max_concurrent_task_num_per_be' (default 5). 
CONF_Int32(routine_load_thread_pool_size, "10"); // kafka reqeust timeout +======= +// kafka request timeout +>>>>>>> dbf5fc61e ([Enhancement] Optimize the root path choosing logic on tablet creation (#26238)) CONF_Int32(routine_load_kafka_timeout_second, "10"); -// pulsar reqeust timeout +// pulsar request timeout CONF_Int32(routine_load_pulsar_timeout_second, "10"); -// Is set to true, index loading failure will not causing BE exit, +// Is set to true, index loading failure will not cause BE exit, // and the tablet will be marked as bad, so that FE will try to repair it. // CONF_Bool(auto_recover_index_loading_failure, "false"); @@ -583,6 +587,10 @@ CONF_mInt32(path_scan_interval_second, "86400"); CONF_mInt32(storage_flood_stage_usage_percent, "95"); // 95% // The min bytes that should be left of a data dir CONF_mInt64(storage_flood_stage_left_capacity_bytes, "107374182400"); // 100GB +// When choosing storage root path for tablet creation, disks with usage larger than the +// average value by `storage_high_usage_disk_protect_ratio` won't be chosen at first. +CONF_mDouble(storage_high_usage_disk_protect_ratio, "0.1"); // 10% + // Number of thread for flushing memtable per store. CONF_mInt32(flush_thread_num_per_store, "2"); @@ -596,21 +604,21 @@ CONF_Int64(brpc_max_body_size, "2147483648"); CONF_Int64(brpc_socket_max_unwritten_bytes, "1073741824"); // Max number of txns for every txn_partition_map in txn manager. -// this is a self protection to avoid too many txns saving in manager. +// this is a self-protection to avoid too many txns saving in manager. CONF_mInt64(max_runnings_transactions_per_txn_map, "100"); // The tablet map shard size, the value must be power of two. -// this is a an enhancement for better performance to manage tablet. +// this is an enhancement for better performance to manage tablet. 
CONF_Int32(tablet_map_shard_size, "32"); CONF_String(plugin_path, "${STARROCKS_HOME}/plugin"); // txn_map_lock shard size, the value is 2^n, n=0,1,2,3,4 -// this is a an enhancement for better performance to manage txn. +// this is an enhancement for better performance to manage txn. CONF_Int32(txn_map_shard_size, "128"); // txn_lock shard size, the value is 2^n, n=0,1,2,3,4 -// this is a an enhancement for better performance to commit and publish txn. +// this is an enhancement for better performance to commit and publish txn. CONF_Int32(txn_shard_size, "1024"); // Whether to continue to start be when load tablet from header failed. @@ -656,7 +664,7 @@ CONF_mInt16(storage_format_version, "2"); // 1 for LZ4_NULL CONF_mInt16(null_encoding, "0"); -// Do pre-aggregate if effect great than the factor, factor range:[1-100]. +// Do pre-aggregate if effect greater than the factor, factor range:[1-100]. CONF_Int16(pre_aggregate_factor, "80"); #ifdef __x86_64__ diff --git a/be/src/storage/data_dir.cpp b/be/src/storage/data_dir.cpp index 7c1f1ceabfc03..00afb85b61323 100644 --- a/be/src/storage/data_dir.cpp +++ b/be/src/storage/data_dir.cpp @@ -537,7 +537,7 @@ Status DataDir::update_capacity() { } bool DataDir::capacity_limit_reached(int64_t incoming_data_size) { - double used_pct = (_disk_capacity_bytes - _available_bytes + incoming_data_size) / (double)_disk_capacity_bytes; + double used_pct = disk_usage(incoming_data_size); int64_t left_bytes = _available_bytes - incoming_data_size; if (used_pct >= config::storage_flood_stage_usage_percent / 100.0 && diff --git a/be/src/storage/data_dir.h b/be/src/storage/data_dir.h index f30c925f80755..f9611d87069fa 100644 --- a/be/src/storage/data_dir.h +++ b/be/src/storage/data_dir.h @@ -76,6 +76,9 @@ class DataDir { int64_t available_bytes() const { return _available_bytes; } int64_t disk_capacity_bytes() const { return _disk_capacity_bytes; } + double disk_usage(int64_t incoming_data_size) const { + return 
(double)(_disk_capacity_bytes - _available_bytes + incoming_data_size) / (double)_disk_capacity_bytes; + } // save a cluster_id file under data path to prevent // invalid be config for example two be use the same @@ -118,7 +121,7 @@ class DataDir { // TODO(cmy): for now we can not precisely calculate the capacity StarRocks used, // so in order to avoid running out of disk capacity, we currently use the actual // disk available capacity and total capacity to do the calculation. - // So that the capacity StarRocks actually used may exceeds the user specified capacity. + // So that the capacity StarRocks actually used may exceed the user specified capacity. bool capacity_limit_reached(int64_t incoming_data_size); Status update_capacity(); diff --git a/be/src/storage/storage_engine.cpp b/be/src/storage/storage_engine.cpp index d06ab38a58f0a..438ab064e8a7e 100644 --- a/be/src/storage/storage_engine.cpp +++ b/be/src/storage/storage_engine.cpp @@ -452,13 +452,31 @@ std::vector StorageEngine::get_stores_for_create_tablet(TStorageMedium } } + // sort by disk usage in asc order std::sort(stores.begin(), stores.end(), - [](const auto& a, const auto& b) { return a->available_bytes() > b->available_bytes(); }); + [](const auto& a, const auto& b) { return a->disk_usage(0) < b->disk_usage(0); }); + + // compute average usage of all disks + double avg_disk_usage = 0.0; + double usage_sum = 0.0; + for (const auto& v : stores) { + usage_sum += v->disk_usage(0); + } + avg_disk_usage = usage_sum / stores.size(); + + // find the last root path which will participate in vector shuffle so that all the paths + // before and included can be chosen to create tablet on preferentially + size_t last_candidate_idx = 0; + for (const auto v : stores) { + if (v->disk_usage(0) > avg_disk_usage + config::storage_high_usage_disk_protect_ratio) { + break; + } + last_candidate_idx++; + } - const int mid = stores.size() / 2 + 1; - // TODO(lingbin): should it be a global util func? 
+ // randomize the preferential paths to balance number of tablets each disk has std::srand(std::random_device()()); - std::shuffle(stores.begin(), stores.begin() + mid, std::mt19937(std::random_device()())); + std::shuffle(stores.begin(), stores.begin() + last_candidate_idx, std::mt19937(std::random_device()())); return stores; } diff --git a/be/src/storage/storage_engine.h b/be/src/storage/storage_engine.h index 258941ca0d254..cb55994757b15 100644 --- a/be/src/storage/storage_engine.h +++ b/be/src/storage/storage_engine.h @@ -92,8 +92,9 @@ class StorageEngine { Status get_all_data_dir_info(std::vector* data_dir_infos, bool need_update); std::vector get_store_paths(); - // get root path for creating tablet. The returned vector of root path should be random, - // for avoiding that all the tablet would be deployed one disk. + // Get root path vector for creating tablet. The returned vector is sorted by disk usage in ascending order, + // then the front portion of the vector, excluding paths which have high disk usage, is shuffled so that + // newly created tablets are not all placed on one specific path. std::vector get_stores_for_create_tablet(TStorageMedium::type storage_medium); DataDir* get_store(const std::string& path); DataDir* get_store(int64_t path_hash);