From eefc3531388cd74f4fa331f027bb756d48ddbdf1 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Fri, 8 Nov 2024 20:58:06 -0800 Subject: [PATCH] [shortfin] Add heuristics for adjusting file descriptor limits on Linux. (#465) Without this, on very large systems (e.g. 64 GPU / 192 Core), it was not possible to open all devices without manual tweaks to file handle descriptor limits. The result was various forms of RESOURCE_EXHAUSTED errors. This may require more tweaking in the future, and for fully robust setups, production installations should explicitly configure high limits. However, these heuristics remove a significant barrier to entry and provide some feedback in terms of logs. Progress on #463 --- shortfin/src/shortfin/local/systems/amdgpu.cc | 17 +++++ shortfin/src/shortfin/local/systems/host.cc | 18 ++++++ shortfin/src/shortfin/support/CMakeLists.txt | 2 + shortfin/src/shortfin/support/sysconfig.cc | 63 +++++++++++++++++++ shortfin/src/shortfin/support/sysconfig.h | 25 ++++++++ 5 files changed, 125 insertions(+) create mode 100644 shortfin/src/shortfin/support/sysconfig.cc create mode 100644 shortfin/src/shortfin/support/sysconfig.h diff --git a/shortfin/src/shortfin/local/systems/amdgpu.cc b/shortfin/src/shortfin/local/systems/amdgpu.cc index 2625e8325..78efad709 100644 --- a/shortfin/src/shortfin/local/systems/amdgpu.cc +++ b/shortfin/src/shortfin/local/systems/amdgpu.cc @@ -7,6 +7,7 @@ #include "shortfin/local/systems/amdgpu.h" #include "shortfin/support/logging.h" +#include "shortfin/support/sysconfig.h" namespace shortfin::local::systems { @@ -190,6 +191,22 @@ SystemPtr AMDGPUSystemBuilder::CreateSystem() { } } + // Estimate the resource requirements for the requested number of devices. + // As of 2024-11-08, the number of file handles required to open 64 device + // partitions was 31 times the number to open one device. Because it is not + // good to run near the limit, we conservatively round that up to 64 above + // an arbitrary baseline of 768. 
This means that on a small, four device + // system, we will not request to raise limits for the Linux default of + // 1024 file handles, but we will raise for everything larger (which tends + // to be where the problems are). + size_t expected_device_count = + used_device_ids.size() * logical_devices_per_physical_device_; + if (!sysconfig::EnsureFileLimit(expected_device_count * 64 + 768)) { + logging::error( + "Could not ensure sufficient file handles for minimum operations: " + "Suggest setting explicit limits with `ulimit -n` and system settings"); + } + // Initialize all used GPU devices. for (size_t instance_ordinal = 0; instance_ordinal < used_device_ids.size(); ++instance_ordinal) { diff --git a/shortfin/src/shortfin/local/systems/host.cc b/shortfin/src/shortfin/local/systems/host.cc index 5629979e4..440a3ff51 100644 --- a/shortfin/src/shortfin/local/systems/host.cc +++ b/shortfin/src/shortfin/local/systems/host.cc @@ -11,6 +11,7 @@ #include "iree/hal/local/loaders/registration/init.h" #include "shortfin/support/iree_helpers.h" #include "shortfin/support/logging.h" +#include "shortfin/support/sysconfig.h" namespace shortfin::local::systems { @@ -149,6 +150,8 @@ iree_hal_driver_t *HostCPUSystemBuilder::InitializeHostCPUDriver(System &lsys) { } // Create one queue executor per node. + unsigned total_needed_file_handles = 512; + bool has_issued_limit_error = false; std::vector queue_executors; queue_executors.reserve(selected_nodes.size()); queue_node_ids_.reserve(selected_nodes.size()); @@ -162,6 +165,21 @@ iree_hal_driver_t *HostCPUSystemBuilder::InitializeHostCPUDriver(System &lsys) { node_id, iree_task_topology_group_count(&topology.topology)); queue_executors.push_back({}); auto &executor = queue_executors.back(); + // As of 2024-11-8, it took approximately 32 file handles per node-group. + // To be conservative because file handle limits are basically free, we + // round up to 64 and assume a floor of 512. 
This allows small, default + // 8 group, single node configs to require no limit increase for Linux + // 1024 default cases. + total_needed_file_handles += 64 * topology.topology.group_count; + if (!sysconfig::EnsureFileLimit(total_needed_file_handles) && + !has_issued_limit_error) { + logging::error( + "Could not ensure sufficient file handles for minimum operations: " + "Suggest setting explicit limits with `ulimit -n` and system " + "settings"); + has_issued_limit_error = true; + } + SHORTFIN_THROW_IF_ERROR(iree_task_executor_create( host_cpu_deps_.task_executor_options, &topology.topology, host_allocator(), executor.for_output())); diff --git a/shortfin/src/shortfin/support/CMakeLists.txt b/shortfin/src/shortfin/support/CMakeLists.txt index cbf171894..ea8572466 100644 --- a/shortfin/src/shortfin/support/CMakeLists.txt +++ b/shortfin/src/shortfin/support/CMakeLists.txt @@ -16,12 +16,14 @@ shortfin_cc_component( iree_concurrency.h logging.h stl_extras.h + sysconfig.h SRCS blocking_executor.cc config.cc globals.cc iree_helpers.cc logging.cc + sysconfig.cc DEPS iree_base_base # TODO: Maybe reclassify some of these low level, shared support entities diff --git a/shortfin/src/shortfin/support/sysconfig.cc b/shortfin/src/shortfin/support/sysconfig.cc new file mode 100644 index 000000000..486f5ffc4 --- /dev/null +++ b/shortfin/src/shortfin/support/sysconfig.cc @@ -0,0 +1,63 @@ +// Copyright 2024 Advanced Micro Devices, Inc. +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "shortfin/support/sysconfig.h"
+
+#include "shortfin/support/logging.h"
+
+#ifdef __linux__
+#include <sys/resource.h>
+#endif
+
+namespace shortfin::sysconfig {
+
+// -----------------------------------------------------------------------------
+// File handle limits
+// -----------------------------------------------------------------------------
+
+#ifdef __linux__
+// Raises the RLIMIT_NOFILE soft limit toward needed_limit (up to the max).
+bool EnsureFileLimit(unsigned needed_limit) {
+  struct rlimit limit;
+  if (getrlimit(RLIMIT_NOFILE, &limit) != 0) {
+    return false;  // Limits cannot be queried, so nothing can be ensured.
+  }
+
+  if (limit.rlim_cur >= needed_limit) return true;  // Already sufficient.
+  unsigned requested_limit = needed_limit;
+  if (limit.rlim_max >= needed_limit) {
+    logging::debug(
+        "Estimated number of open file handles ({}) > current limit ({}) but "
+        "within max limit ({}): Increasing limit",
+        needed_limit, limit.rlim_cur, limit.rlim_max);
+  } else if (limit.rlim_max > limit.rlim_cur) {
+    logging::warn(
+        "Estimated number of open file handles ({}) > current ({}) and max "
+        "({}) limit: Increasing to max",
+        needed_limit, limit.rlim_cur, limit.rlim_max);
+    requested_limit = limit.rlim_max;
+  } else {
+    logging::warn("Estimated number of open file handles ({}) > max ({})",
+                  needed_limit, limit.rlim_max);
+    return false;
+  }
+
+  limit.rlim_cur = requested_limit;  // Only the soft limit is changed.
+  if (setrlimit(RLIMIT_NOFILE, &limit) != 0) {
+    logging::error("Could not set open file handle limit to {}",
+                   requested_limit);
+    return false;
+  }
+
+  return limit.rlim_cur >= needed_limit;
+}
+
+#else
+// Fallback implementation.
+bool EnsureFileLimit(unsigned needed_limit) { return true; }
+#endif
+
+}  // namespace shortfin::sysconfig
diff --git a/shortfin/src/shortfin/support/sysconfig.h b/shortfin/src/shortfin/support/sysconfig.h
new file mode 100644
index 000000000..864405efc
--- /dev/null
+++ b/shortfin/src/shortfin/support/sysconfig.h
@@ -0,0 +1,25 @@
+// Copyright 2024 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef SHORTFIN_SUPPORT_SYSCONFIG_H +#define SHORTFIN_SUPPORT_SYSCONFIG_H + +#include +#include + +namespace shortfin::sysconfig { + +// Attempts to ensure that `needed_limit` file descriptors can be open at once. +// On platforms other than Linux no limit is checked and true is returned. On +// Linux, returns true if the current limit already covers `needed_limit` or +// could be raised to cover it. Otherwise false is returned, in most cases +// after logging a warning or error. +// This is a best effort attempt. +bool EnsureFileLimit(unsigned needed_limit); + +} // namespace shortfin::sysconfig + +#endif // SHORTFIN_SUPPORT_SYSCONFIG_H