Skip to content

Commit

Permalink
[shortfin] Add heuristics for adjusting file descriptor limits on Lin…
Browse files Browse the repository at this point in the history
…ux. (#465)

Without this, on very large systems (i.e. 64 GPU / 192 Core), it was not
possible to open all devices without manual tweaks to file handle
descriptor limits. The result was various forms of RESOURCE_EXHAUSTED
errors. This may require more tweaking in the future, and for fully
robust setups, production installations should explicitly configure high
limits. However, these heuristics remove a significant barrier to entry
and provide some feedback in terms of logs.

Progress on #463
  • Loading branch information
stellaraccident authored Nov 9, 2024
1 parent 2cbf768 commit eefc353
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 0 deletions.
17 changes: 17 additions & 0 deletions shortfin/src/shortfin/local/systems/amdgpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "shortfin/local/systems/amdgpu.h"

#include "shortfin/support/logging.h"
#include "shortfin/support/sysconfig.h"

namespace shortfin::local::systems {

Expand Down Expand Up @@ -190,6 +191,22 @@ SystemPtr AMDGPUSystemBuilder::CreateSystem() {
}
}

// Estimate the resource requirements for the requested number of devices.
// As of 2024-11-08, the number of file handles required to open 64 device
// partitions was 31 times the number to open one device. Because it is not
// good to run near the limit, we conservatively round that up to 64 above
// an arbitrary baseline of 768. This means that on a small, four device
// system, we will not request to raise limits for the Linux default of
// 1024 file handles, but we will raise for everything larger (which tends
// to be where the problems are).
size_t expected_device_count =
used_device_ids.size() * logical_devices_per_physical_device_;
if (!sysconfig::EnsureFileLimit(expected_device_count * 64 + 768)) {
logging::error(
"Could not ensure sufficient file handles for minimum operations: "
"Suggest setting explicit limits with `ulimit -n` and system settings");
}

// Initialize all used GPU devices.
for (size_t instance_ordinal = 0; instance_ordinal < used_device_ids.size();
++instance_ordinal) {
Expand Down
18 changes: 18 additions & 0 deletions shortfin/src/shortfin/local/systems/host.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "iree/hal/local/loaders/registration/init.h"
#include "shortfin/support/iree_helpers.h"
#include "shortfin/support/logging.h"
#include "shortfin/support/sysconfig.h"

namespace shortfin::local::systems {

Expand Down Expand Up @@ -149,6 +150,8 @@ iree_hal_driver_t *HostCPUSystemBuilder::InitializeHostCPUDriver(System &lsys) {
}

// Create one queue executor per node.
unsigned total_needed_file_handles = 512;
bool has_issued_limit_error = false;
std::vector<iree::task_executor_ptr> queue_executors;
queue_executors.reserve(selected_nodes.size());
queue_node_ids_.reserve(selected_nodes.size());
Expand All @@ -162,6 +165,21 @@ iree_hal_driver_t *HostCPUSystemBuilder::InitializeHostCPUDriver(System &lsys) {
node_id, iree_task_topology_group_count(&topology.topology));
queue_executors.push_back({});
auto &executor = queue_executors.back();
// As of 2024-11-8, it took approximately 32 file handles per node-group.
// To be conservative because file handle limits are basically free, we
// round up to 64 and assume a floor of 512. This allows small, default
// 8 group, single node configs to require no limit increase for Linux
// 1024 default cases.
total_needed_file_handles += 64 * topology.topology.group_count;
if (!sysconfig::EnsureFileLimit(total_needed_file_handles) &&
!has_issued_limit_error) {
logging::error(
"Could not ensure sufficient file handles for minimum operations: "
"Suggest setting explicit limits with `ulimit -n` and system "
"settings");
has_issued_limit_error = true;
}

SHORTFIN_THROW_IF_ERROR(iree_task_executor_create(
host_cpu_deps_.task_executor_options, &topology.topology,
host_allocator(), executor.for_output()));
Expand Down
2 changes: 2 additions & 0 deletions shortfin/src/shortfin/support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ shortfin_cc_component(
iree_concurrency.h
logging.h
stl_extras.h
sysconfig.h
SRCS
blocking_executor.cc
config.cc
globals.cc
iree_helpers.cc
logging.cc
sysconfig.cc
DEPS
iree_base_base
# TODO: Maybe reclassify some of these low level, shared support entities
Expand Down
63 changes: 63 additions & 0 deletions shortfin/src/shortfin/support/sysconfig.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Copyright 2024 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "shortfin/support/sysconfig.h"

#include "shortfin/support/logging.h"

#ifdef __linux__
#include <sys/resource.h>
#endif

namespace shortfin::sysconfig {

// -----------------------------------------------------------------------------
// File handle limits
// -----------------------------------------------------------------------------

#ifdef __linux__

// Best-effort attempt to make the soft RLIMIT_NOFILE limit of the current
// process at least `needed_limit`. The soft limit is only ever raised, and
// never beyond the hard limit.
//
// Returns true if the soft limit is at least `needed_limit` on return.
// Returns false if the limit could not be queried or changed, or if the hard
// limit is lower than `needed_limit` (in that case the soft limit is still
// raised to the hard limit when there is headroom).
bool EnsureFileLimit(unsigned needed_limit) {
  struct rlimit limit;
  if (getrlimit(RLIMIT_NOFILE, &limit) != 0) {
    logging::warn("Could not query open file handle limit");
    return false;
  }

  // Nothing to do if the soft limit already covers the estimate.
  if (limit.rlim_cur >= needed_limit) return true;

  // Use the native rlim_t type so that assigning from rlim_max never narrows.
  rlim_t requested_limit = needed_limit;
  if (limit.rlim_max >= needed_limit) {
    // The hard limit allows the full request.
    logging::debug(
        "Estimated number of open file handles ({}) > current limit ({}) but "
        "within max limit ({}): Increasing limit",
        needed_limit, limit.rlim_cur, limit.rlim_max);
  } else if (limit.rlim_max > limit.rlim_cur) {
    // The hard limit is too low for the full request, but there is still
    // headroom above the soft limit: take what we can get.
    logging::warn(
        "Estimated number of open file handles ({}) > current ({}) and max "
        "({}) limit: Increasing to max",
        needed_limit, limit.rlim_cur, limit.rlim_max);
    requested_limit = limit.rlim_max;
  } else {
    // Soft limit is already at the hard limit; nothing more can be done.
    logging::warn("Estimated number of open file handles ({}) > max ({})",
                  needed_limit, limit.rlim_max);
    return false;
  }

  limit.rlim_cur = requested_limit;
  if (setrlimit(RLIMIT_NOFILE, &limit) != 0) {
    logging::error("Could not set open file handle limit to {}",
                   requested_limit);
    return false;
  }

  // When we could only raise to the hard limit, this still reports failure.
  return limit.rlim_cur >= needed_limit;
}

#else
// Fallback for non-Linux builds: no limit is queried or modified, and the
// request is always reported as satisfied.
bool EnsureFileLimit(unsigned needed_limit) {
  static_cast<void>(needed_limit);
  return true;
}
#endif

} // namespace shortfin::sysconfig
25 changes: 25 additions & 0 deletions shortfin/src/shortfin/support/sysconfig.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright 2024 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef SHORTFIN_SUPPORT_SYSCONFIG_H
#define SHORTFIN_SUPPORT_SYSCONFIG_H

#include <cstdint>
#include <utility>

namespace shortfin::sysconfig {

// Attempts to ensure that at least `needed_limit` file descriptors can be
// open in the current process. This is a best effort attempt.
//
// On builds where no limit adjustment is implemented (currently anything
// other than Linux), nothing is done and true is returned.
//
// On Linux, the soft RLIMIT_NOFILE limit is compared against `needed_limit`
// and, if it is too low, raised as far as the hard limit allows. Returns true
// if the resulting soft limit is at least `needed_limit`. Otherwise (the limit
// could not be queried or set, or the hard limit is lower than
// `needed_limit`), a message is logged for most failures and false is
// returned.
bool EnsureFileLimit(unsigned needed_limit);

} // namespace shortfin::sysconfig

#endif // SHORTFIN_SUPPORT_SYSCONFIG_H

0 comments on commit eefc353

Please sign in to comment.