From a169c78b6d3b082068deb49a39aaa1fd75464c7f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Fri, 17 May 2024 12:10:31 +0200
Subject: [PATCH] Improve performance on NUMA systems

Allow NUMA memory replication for the NNUE weights. Bind threads to ensure
execution on a specific NUMA node.

This patch introduces NUMA memory replication, currently used only for the
NNUE weights. Along with it comes all the machinery required to identify NUMA
nodes and bind threads to specific processors/nodes. It also includes small
changes to Thread and ThreadPool to allow easier execution of custom functions
on a designated thread. The old thread-binding (WinProcGroup) machinery is
removed because it is incompatible with this patch. Small changes to unrelated
parts of the code were made to ensure correctness, such as some classes being
made unmovable, raw pointers being replaced with unique_ptr, etc.

Windows 7 and Windows 10 are partially supported. Windows 11 is fully
supported. Linux is fully supported, with explicit exclusion of Android.
No additional dependencies.

-----------------

A new UCI option `NumaPolicy` is introduced. It can take the following values:
```
system - gathers NUMA node information from the system (lscpu or the Windows API),
         and binds each thread to a single NUMA node
none - assumes there is 1 NUMA node, never binds threads
auto - this is the default value; depending on the number of configured threads
       and NUMA nodes, it enables binding only on multi-node systems and only when
       the number of threads reaches a threshold (dependent on node size and count)
[[custom]] -
  // ':'-separated numa nodes
  // ','-separated cpu indices
  // supports "first-last" range syntax for cpu indices,
  // for example '0-15,32-47:16-31,48-63'
```

Setting `NumaPolicy` forces recreation of the threads in the ThreadPool, which
in turn forces the recreation of the TT.

The threads are distributed among NUMA nodes in a round-robin fashion based on
fill percentage (i.e. it strives to fill all NUMA nodes evenly). Threads are
bound to NUMA nodes, not to specific processors, because that is our only
requirement and the OS can schedule them better. Special care is taken that the
maximum memory usage on systems that do not require memory replication stays as
before, that is, unnecessary copies are avoided.

On Linux the process's processor affinity is respected. This means that if, for
example, you use taskset to restrict Stockfish to a single NUMA node, then the
`system` and `auto` settings will only see a single NUMA node (more precisely,
the processors included in the current affinity mask) and act accordingly.

-----------------

We cannot ensure that a memory allocation takes place on a given NUMA node
without using libnuma on Linux, or appropriate custom allocators on Windows
(https://learn.microsoft.com/en-us/windows/win32/memory/allocating-memory-from-a-numa-node),
so to avoid complications the current implementation relies on the first-touch
policy. Because of this we also rely on the memory allocator to give us a new
chunk of untouched memory from the system. This appears to work reliably on
Linux, but results may vary.

MacOS is not supported, because AFAIK it is not affected, and an implementation
would be problematic anyway.

Windows is supported since Windows 7
(https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity).
Until Windows 11/Server 2022, NUMA nodes are split such that they cannot span
processor groups.
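As an illustration of that splitting, here is a minimal, self-contained sketch
(assuming 64 logical processors per Windows processor group; the function and
type names are illustrative only — the real logic lives in
NumaConfig::from_system() in src/numa.h further down in this patch):
```
// Illustrative sketch only: split each detected NUMA node so that no node
// spans a Windows processor group (64 logical processors per group).
#include <cstddef>
#include <set>
#include <vector>

constexpr std::size_t GroupSize = 64;  // processors per Windows processor group

std::vector<std::set<std::size_t>>
split_by_processor_group(const std::vector<std::set<std::size_t>>& nodes) {
    std::vector<std::set<std::size_t>> result;
    for (const auto& cpus : nodes)
    {
        if (cpus.empty())
            continue;

        // Open a node for the group of the lowest cpu index in this node.
        std::size_t lastGroup = *cpus.begin() / GroupSize;
        result.emplace_back();
        for (std::size_t c : cpus)
        {
            const std::size_t group = c / GroupSize;
            if (group != lastGroup)  // crossed a group boundary: open a new node
            {
                result.emplace_back();
                lastGroup = group;
            }
            result.back().insert(c);
        }
    }
    return result;
}
```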
This is because before Windows 11/Server 2022 it's not possible to set thread affinity spanning processor groups. The splitting is done manually in some cases (required after Windows 10 Build 20348). Since Windows 11/Server 2022 we can set affinites spanning processor group so this splitting is not done, so the behaviour is pretty much like on linux. Linux is supported, **without** libnuma requirement. `lscpu` is expected. ----------------- Passed 60+1 @ 256t 16000MB hash: https://tests.stockfishchess.org/tests/view/6654e443a86388d5e27db0d8 ``` LLR: 2.95 (-2.94,2.94) <0.00,10.00> Total: 278 W: 110 L: 29 D: 139 Ptnml(0-2): 0, 1, 56, 82, 0 ``` Passed SMP STC: https://tests.stockfishchess.org/tests/view/6654fc74a86388d5e27db1cd ``` LLR: 2.95 (-2.94,2.94) <-1.75,0.25> Total: 67152 W: 17354 L: 17177 D: 32621 Ptnml(0-2): 64, 7428, 18408, 7619, 57 ``` Passed STC: https://tests.stockfishchess.org/tests/view/6654fb27a86388d5e27db15c ``` LLR: 2.94 (-2.94,2.94) <-1.75,0.25> Total: 131648 W: 34155 L: 34045 D: 63448 Ptnml(0-2): 426, 13878, 37096, 14008, 416 ``` fixes #5253 closes https://github.com/official-stockfish/Stockfish/pull/5285 No functional change --- .github/ci/libcxx17.imp | 1 + src/Makefile | 2 +- src/engine.cpp | 88 +++- src/engine.h | 31 +- src/misc.cpp | 134 +----- src/misc.h | 56 ++- src/nnue/network.cpp | 42 ++ src/nnue/network.h | 6 + src/numa.h | 904 ++++++++++++++++++++++++++++++++++++++++ src/search.cpp | 41 +- src/search.h | 37 +- src/thread.cpp | 192 ++++++--- src/thread.h | 91 +++- src/tt.cpp | 29 +- src/tt.h | 5 +- src/uci.cpp | 42 +- src/uci.h | 3 + src/ucioption.cpp | 2 + src/ucioption.h | 1 + 19 files changed, 1418 insertions(+), 289 deletions(-) create mode 100644 src/numa.h diff --git a/.github/ci/libcxx17.imp b/.github/ci/libcxx17.imp index 7bdcf5bc2de..d3a262b54e8 100644 --- a/.github/ci/libcxx17.imp +++ b/.github/ci/libcxx17.imp @@ -7,6 +7,7 @@ { include: [ "<__fwd/sstream.h>", private, "", public ] }, { include: [ "<__fwd/streambuf.h>", private, "", public ] }, { include: [ "<__fwd/string_view.h>", private, "", public ] }, + { include: [ "<__system_error/errc.h>", private, "", public ] }, # Mappings for includes between public headers { include: [ "", public, "", public ] }, diff --git a/src/Makefile b/src/Makefile index 45f38b01322..5119b615f6b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -63,7 +63,7 @@ HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \ nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \ nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \ search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \ - tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h + tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h OBJS = $(notdir $(SRCS:.cpp=.o)) diff --git a/src/engine.cpp b/src/engine.cpp index e8da24aa9e8..3fc27223a09 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -18,15 +18,15 @@ #include "engine.h" +#include #include +#include #include #include +#include #include #include #include -#include -#include -#include #include "evaluate.h" #include "misc.h" @@ -48,10 +48,14 @@ constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - Engine::Engine(std::string path) : binaryDirectory(CommandLine::get_binary_directory(path)), + numaContext(NumaConfig::from_system()), states(new std::deque(1)), - networks(NN::Networks( - NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG), - 
NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) { + threads(), + networks( + numaContext, + NN::Networks( + NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG), + NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) { pos.set(StartFEN, false, &states->back()); capSq = SQ_NONE; } @@ -74,7 +78,7 @@ void Engine::stop() { threads.stop = true; } void Engine::search_clear() { wait_for_search_finished(); - tt.clear(options["Threads"]); + tt.clear(threads); threads.clear(); // @TODO wont work with multiple instances @@ -124,11 +128,35 @@ void Engine::set_position(const std::string& fen, const std::vector // modifiers -void Engine::resize_threads() { threads.set({options, threads, tt, networks}, updateContext); } +void Engine::set_numa_config_from_option(const std::string& o) { + if (o == "auto" || o == "system") + { + numaContext.set_numa_config(NumaConfig::from_system()); + } + else if (o == "none") + { + numaContext.set_numa_config(NumaConfig{}); + } + else + { + numaContext.set_numa_config(NumaConfig::from_string(o)); + } + + // Force reallocation of threads in case affinities need to change. + resize_threads(); +} + +void Engine::resize_threads() { + threads.wait_for_search_finished(); + threads.set(numaContext.get_numa_config(), {options, threads, tt, networks}, updateContext); + + // Reallocate the hash with the new threadpool size + set_tt_size(options["Hash"]); +} void Engine::set_tt_size(size_t mb) { wait_for_search_finished(); - tt.resize(mb, options["Threads"]); + tt.resize(mb, threads); } void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; } @@ -136,28 +164,35 @@ void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; } // network related void Engine::verify_networks() const { - networks.big.verify(options["EvalFile"]); - networks.small.verify(options["EvalFileSmall"]); + networks->big.verify(options["EvalFile"]); + networks->small.verify(options["EvalFileSmall"]); } void Engine::load_networks() { - load_big_network(options["EvalFile"]); - load_small_network(options["EvalFileSmall"]); + networks.modify_and_replicate([this](NN::Networks& networks_) { + networks_.big.load(binaryDirectory, options["EvalFile"]); + networks_.small.load(binaryDirectory, options["EvalFileSmall"]); + }); + threads.clear(); } void Engine::load_big_network(const std::string& file) { - networks.big.load(binaryDirectory, file); + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.big.load(binaryDirectory, file); }); threads.clear(); } void Engine::load_small_network(const std::string& file) { - networks.small.load(binaryDirectory, file); + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.small.load(binaryDirectory, file); }); threads.clear(); } void Engine::save_network(const std::pair, std::string> files[2]) { - networks.big.save(files[0].first); - networks.small.save(files[1].first); + networks.modify_and_replicate([&files](NN::Networks& networks_) { + networks_.big.save(files[0].first); + networks_.small.save(files[1].first); + }); } // utility functions @@ -169,7 +204,7 @@ void Engine::trace_eval() const { verify_networks(); - sync_cout << "\n" << Eval::trace(p, networks) << sync_endl; + sync_cout << "\n" << Eval::trace(p, *networks) << sync_endl; } OptionsMap& Engine::get_options() { return options; } @@ -184,4 +219,21 @@ std::string Engine::visualize() const { return ss.str(); } +std::vector> 
Engine::get_bound_thread_count_by_numa_node() const { + auto counts = threads.get_bound_thread_count_by_numa_node(); + const NumaConfig& cfg = numaContext.get_numa_config(); + std::vector> ratios; + NumaIndex n = 0; + for (; n < counts.size(); ++n) + ratios.emplace_back(counts[n], cfg.num_cpus_in_numa_node(n)); + if (!counts.empty()) + for (; n < cfg.num_numa_nodes(); ++n) + ratios.emplace_back(0, cfg.num_cpus_in_numa_node(n)); + return ratios; +} + +std::string Engine::get_numa_config_as_string() const { + return numaContext.get_numa_config().to_string(); +} + } diff --git a/src/engine.h b/src/engine.h index 64a814cb4aa..91a8a96b0dc 100644 --- a/src/engine.h +++ b/src/engine.h @@ -35,6 +35,7 @@ #include "thread.h" #include "tt.h" #include "ucioption.h" +#include "numa.h" namespace Stockfish { @@ -47,6 +48,13 @@ class Engine { using InfoIter = Search::InfoIteration; Engine(std::string path = ""); + + // Can't be movable due to components holding backreferences to fields + Engine(const Engine&) = delete; + Engine(Engine&&) = delete; + Engine& operator=(const Engine&) = delete; + Engine& operator=(Engine&&) = delete; + ~Engine() { wait_for_search_finished(); } std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960); @@ -63,6 +71,7 @@ class Engine { // modifiers + void set_numa_config_from_option(const std::string& o); void resize_threads(); void set_tt_size(size_t mb); void set_ponderhit(bool); @@ -83,23 +92,27 @@ class Engine { // utility functions - void trace_eval() const; - OptionsMap& get_options(); - std::string fen() const; - void flip(); - std::string visualize() const; + void trace_eval() const; + OptionsMap& get_options(); + std::string fen() const; + void flip(); + std::string visualize() const; + std::vector> get_bound_thread_count_by_numa_node() const; + std::string get_numa_config_as_string() const; private: const std::string binaryDirectory; + NumaReplicationContext numaContext; + Position pos; StateListPtr states; Square capSq; - OptionsMap options; - ThreadPool threads; - TranspositionTable tt; - Eval::NNUE::Networks networks; + OptionsMap options; + ThreadPool threads; + TranspositionTable tt; + NumaReplicated networks; Search::SearchManager::UpdateContext updateContext; }; diff --git a/src/misc.cpp b/src/misc.cpp index 58f804204b2..d48b75e1c28 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -48,6 +48,7 @@ using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGE #endif #include +#include #include #include #include @@ -56,6 +57,7 @@ using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGE #include #include #include +#include #include "types.h" @@ -592,129 +594,6 @@ void aligned_large_pages_free(void* mem) { std_aligned_free(mem); } #endif -namespace WinProcGroup { - -#ifndef _WIN32 - -void bind_this_thread(size_t) {} - -#else - -namespace { -// Retrieves logical processor information using Windows-specific -// API and returns the best node id for the thread with index idx. Original -// code from Texel by Peter Ă–sterlund. -int best_node(size_t idx) { - - int threads = 0; - int nodes = 0; - int cores = 0; - DWORD returnLength = 0; - DWORD byteOffset = 0; - - // Early exit if the needed API is not available at runtime - HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); - auto fun1 = (fun1_t) (void (*)()) GetProcAddress(k32, "GetLogicalProcessorInformationEx"); - if (!fun1) - return -1; - - // First call to GetLogicalProcessorInformationEx() to get returnLength. 
- // We expect the call to fail due to null buffer. - if (fun1(RelationAll, nullptr, &returnLength)) - return -1; - - // Once we know returnLength, allocate the buffer - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *buffer, *ptr; - ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) malloc(returnLength); - - // Second call to GetLogicalProcessorInformationEx(), now we expect to succeed - if (!fun1(RelationAll, buffer, &returnLength)) - { - free(buffer); - return -1; - } - - while (byteOffset < returnLength) - { - if (ptr->Relationship == RelationNumaNode) - nodes++; - - else if (ptr->Relationship == RelationProcessorCore) - { - cores++; - threads += (ptr->Processor.Flags == LTP_PC_SMT) ? 2 : 1; - } - - assert(ptr->Size); - byteOffset += ptr->Size; - ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) (((char*) ptr) + ptr->Size); - } - - free(buffer); - - std::vector groups; - - // Run as many threads as possible on the same node until the core limit is - // reached, then move on to filling the next node. - for (int n = 0; n < nodes; n++) - for (int i = 0; i < cores / nodes; i++) - groups.push_back(n); - - // In case a core has more than one logical processor (we assume 2) and we - // still have threads to allocate, spread them evenly across available nodes. - for (int t = 0; t < threads - cores; t++) - groups.push_back(t % nodes); - - // If we still have more threads than the total number of logical processors - // then return -1 and let the OS to decide what to do. - return idx < groups.size() ? groups[idx] : -1; -} -} - - -// Sets the group affinity of the current thread -void bind_this_thread(size_t idx) { - - // Use only local variables to be thread-safe - int node = best_node(idx); - - if (node == -1) - return; - - // Early exit if the needed API are not available at runtime - HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); - auto fun2 = fun2_t((void (*)()) GetProcAddress(k32, "GetNumaNodeProcessorMaskEx")); - auto fun3 = fun3_t((void (*)()) GetProcAddress(k32, "SetThreadGroupAffinity")); - auto fun4 = fun4_t((void (*)()) GetProcAddress(k32, "GetNumaNodeProcessorMask2")); - auto fun5 = fun5_t((void (*)()) GetProcAddress(k32, "GetMaximumProcessorGroupCount")); - - if (!fun2 || !fun3) - return; - - if (!fun4 || !fun5) - { - GROUP_AFFINITY affinity; - if (fun2(node, &affinity)) // GetNumaNodeProcessorMaskEx - fun3(GetCurrentThread(), &affinity, nullptr); // SetThreadGroupAffinity - } - else - { - // If a numa node has more than one processor group, we assume they are - // sized equal and we spread threads evenly across the groups. 
- USHORT elements, returnedElements; - elements = fun5(); // GetMaximumProcessorGroupCount - GROUP_AFFINITY* affinity = (GROUP_AFFINITY*) malloc(elements * sizeof(GROUP_AFFINITY)); - if (fun4(node, affinity, elements, &returnedElements)) // GetNumaNodeProcessorMask2 - fun3(GetCurrentThread(), &affinity[idx % returnedElements], - nullptr); // SetThreadGroupAffinity - free(affinity); - } -} - -#endif - -} // namespace WinProcGroup - #ifdef _WIN32 #include #define GETCWD _getcwd @@ -723,6 +602,15 @@ void bind_this_thread(size_t idx) { #define GETCWD getcwd #endif +size_t str_to_size_t(const std::string& s) { + size_t value; + auto result = std::from_chars(s.data(), s.data() + s.size(), value); + + if (result.ec != std::errc()) + std::exit(EXIT_FAILURE); + + return value; +} std::string CommandLine::get_binary_directory(std::string argv0) { std::string pathSeparator; diff --git a/src/misc.h b/src/misc.h index 3a905dfab49..99cbecfdd2c 100644 --- a/src/misc.h +++ b/src/misc.h @@ -24,10 +24,12 @@ #include #include #include +#include #include #include #include #include +#include #define stringify2(x) #x #define stringify(x) stringify2(x) @@ -50,6 +52,8 @@ void* aligned_large_pages_alloc(size_t size); // nop if mem == nullptr void aligned_large_pages_free(void* mem); +size_t str_to_size_t(const std::string& s); + // Deleter for automating release of memory area template struct AlignedDeleter { @@ -73,6 +77,31 @@ using AlignedPtr = std::unique_ptr>; template using LargePagePtr = std::unique_ptr>; +struct PipeDeleter { + void operator()(FILE* file) const { + if (file != nullptr) + { + pclose(file); + } + } +}; + +#if defined(__linux__) + +inline std::optional get_system_command_output(const std::string& command) { + std::unique_ptr pipe(popen(command.c_str(), "r")); + if (!pipe) + return std::nullopt; + + std::string result; + char buffer[1024]; + while (fgets(buffer, sizeof(buffer), pipe.get()) != nullptr) + result += buffer; + + return result; +} + +#endif void dbg_hit_on(bool cond, int slot = 0); void dbg_mean_of(int64_t value, int slot = 0); @@ -88,6 +117,24 @@ inline TimePoint now() { .count(); } +inline std::vector split(const std::string& s, const std::string& delimiter) { + size_t begin = 0; + std::vector res; + + for (;;) + { + const size_t end = s.find(delimiter, begin); + if (end == std::string::npos) + break; + + res.emplace_back(s.substr(begin, end - begin)); + begin = end + delimiter.size(); + } + + res.emplace_back(s.substr(begin)); + + return res; +} enum SyncCout { IO_LOCK, @@ -194,15 +241,6 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) { #endif } -// Under Windows it is not possible for a process to run on more than one -// logical processor group. This usually means being limited to using max 64 -// cores. To overcome this, some special platform-specific API should be -// called to set group affinity for each thread. Original code from Texel by -// Peter Ă–sterlund. 
-namespace WinProcGroup { -void bind_this_thread(size_t idx); -} - struct CommandLine { public: diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index de2c7eca6d5..db864fcd384 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,47 @@ bool write_parameters(std::ostream& stream, const T& reference) { } // namespace Detail +template +Network::Network(const Network& other) : + evalFile(other.evalFile), + embeddedType(other.embeddedType) { + if (other.featureTransformer) + { + Detail::initialize(featureTransformer); + *featureTransformer = *other.featureTransformer; + } + for (std::size_t i = 0; i < LayerStacks; ++i) + { + if (other.network[i]) + { + Detail::initialize(network[i]); + *(network[i]) = *(other.network[i]); + } + } +} + +template +Network& +Network::operator=(const Network& other) { + evalFile = other.evalFile; + embeddedType = other.embeddedType; + + if (other.featureTransformer) + { + Detail::initialize(featureTransformer); + *featureTransformer = *other.featureTransformer; + } + for (std::size_t i = 0; i < LayerStacks; ++i) + { + if (other.network[i]) + { + Detail::initialize(network[i]); + *(network[i]) = *(other.network[i]); + } + } + + return *this; +} template void Network::load(const std::string& rootDirectory, std::string evalfilePath) { diff --git a/src/nnue/network.h b/src/nnue/network.h index 23f56663094..f0ccfafcb4c 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -50,6 +50,12 @@ class Network { evalFile(file), embeddedType(type) {} + Network(const Network& other); + Network(Network&& other) = default; + + Network& operator=(const Network& other); + Network& operator=(Network&& other) = default; + void load(const std::string& rootDirectory, std::string evalfilePath); bool save(const std::optional& filename) const; diff --git a/src/numa.h b/src/numa.h new file mode 100644 index 00000000000..c04292daf01 --- /dev/null +++ b/src/numa.h @@ -0,0 +1,904 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef NUMA_H_INCLUDED +#define NUMA_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// We support linux very well, but we explicitly do NOT support Android, partially because +// there are potential issues with `lscpu`, `popen` availability, and partially because +// there's no NUMA environments running Android and there probably won't be. +#if defined(__linux__) && !defined(__ANDROID__) + #if !defined(_GNU_SOURCE) + #define _GNU_SOURCE + #endif + #include +#elif defined(_WIN32) + +// On Windows each processor group can have up to 64 processors. 
+// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups +static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64; + + #if !defined(NOMINMAX) + #define NOMINMAX + #endif + #include + +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks +using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT); + +// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity +using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY); + +#endif + +#include "misc.h" + +namespace Stockfish { + +using CpuIndex = size_t; +using NumaIndex = size_t; + +inline const CpuIndex SYSTEM_THREADS_NB = + std::max(1, std::thread::hardware_concurrency()); + +// We want to abstract the purpose of storing the numa node index somewhat. +// Whoever is using this does not need to know the specifics of the replication +// machinery to be able to access NUMA replicated memory. +class NumaReplicatedAccessToken { + public: + NumaReplicatedAccessToken() : + n(0) {} + + explicit NumaReplicatedAccessToken(NumaIndex idx) : + n(idx) {} + + NumaIndex get_numa_index() const { return n; } + + private: + NumaIndex n; +}; + +// Designed as immutable, because there is no good reason to alter an already existing config +// in a way that doesn't require recreating it completely, and it would be complex and expensive +// to maintain class invariants. +// The CPU (processor) numbers always correspond to the actual numbering used by the system. +// NOTE: the numbering is only valid within the process, as for example on Windows +// every process gets a "virtualized" set of processors that respects the current affinity +// The NUMA node numbers MAY NOT correspond to the system's numbering of the NUMA nodes. +// In particular, empty nodes may be removed, or the user may create custom nodes. +// It is guaranteed that NUMA nodes are NOT empty, i.e. every node exposed by NumaConfig +// has at least one processor assigned. +// +// Until Stockfish doesn't support exceptions all places where an exception should be thrown +// are replaced by std::exit. +class NumaConfig { + public: + NumaConfig() : + highestCpuIndex(0), + customAffinity(false) { + const auto numCpus = SYSTEM_THREADS_NB; + add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1); + } + + static std::set get_process_affinity() { + std::set cpus; + + // For unsupported systems, or in case of a soft error, we may assume all processors + // are available for use. + [[maybe_unused]] auto set_to_all_cpus = [&]() { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + cpus.insert(c); + }; + +#if defined(__linux__) && !defined(__ANDROID__) + + // cpu_set_t by default holds 1024 entries. This may not be enough soon, + // but there is no easy way to determine how many threads there actually is. + // In this case we just choose a reasonable upper bound. 
+ static constexpr CpuIndex MaxNumCpus = 1024 * 64; + + cpu_set_t* mask = CPU_ALLOC(MaxNumCpus); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus); + + CPU_ZERO_S(masksize, mask); + + const int status = sched_getaffinity(0, masksize, mask); + + if (status != 0) + { + CPU_FREE(mask); + std::exit(EXIT_FAILURE); + } + + for (CpuIndex c = 0; c < MaxNumCpus; ++c) + if (CPU_ISSET_S(c, masksize, mask)) + cpus.insert(c); + + CPU_FREE(mask); + +#elif defined(_WIN32) + + // Windows is problematic and weird due to multiple ways of setting affinity, processor groups, + // and behaviour changes between versions. It's unclear if we can support this feature + // on Windows in the same way we do on Linux. + // Apparently when affinity is set via either start /affinity or msys2 taskset + // the function GetNumaProcessorNodeEx completely disregards the processors that we do not + // have affinity more. Moreover, the indices are shifted to start from 0, indicating that Windows + // is providing a whole new mapping of processors to this process. This is problematic in some cases + // but it at least allows us to [probably] support this affinity restriction feature by default. + // So overall, Windows appears to "virtualize" a set of processors and processor groups for every + // process. It's unclear if this assignment can change while the process is running. + // std::thread::hardware_concurrency() returns the number of processors that's consistent + // with GetNumaProcessorNodeEx, so we can just add all of them. + + set_to_all_cpus(); + +#else + + // For other systems we assume the process is allowed to execute on all processors. + set_to_all_cpus(); + +#endif + + return cpus; + } + + // This function queries the system for the mapping of processors to NUMA nodes. + // On Linux we utilize `lscpu` to avoid libnuma. + // On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see + // comment for Windows implementation of get_process_affinity + static NumaConfig from_system(bool respectProcessAffinity = true) { + NumaConfig cfg = empty(); + + std::set allowedCpus; + + if (respectProcessAffinity) + allowedCpus = get_process_affinity(); + else + { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + allowedCpus.insert(c); + } + + auto is_cpu_allowed = [&](CpuIndex c) { return allowedCpus.count(c) == 1; }; + +#if defined(__linux__) && !defined(__ANDROID__) + + // On Linux things are straightforward, since there's no processor groups and + // any thread can be scheduled on all processors. + // This command produces output in the following form + // CPU NODE + // 0 0 + // 1 0 + // 2 1 + // 3 1 + // + // On some systems it may use '-' to signify no NUMA node, in which case we assume it's in node 0. + auto lscpuOpt = get_system_command_output("lscpu -e=cpu,node"); + if (lscpuOpt.has_value()) + { + + std::istringstream ss(*lscpuOpt); + + // skip the list header + ss.ignore(std::numeric_limits::max(), '\n'); + + while (true) + { + CpuIndex c; + NumaIndex n; + + ss >> c; + + if (!ss) + break; + + ss >> n; + + if (!ss) + { + ss.clear(); + std::string dummy; + ss >> dummy; + n = 0; + } + + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(n, c); + } + } + else + { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(NumaIndex{0}, c); + } + +#elif defined(_WIN32) + + // Since Windows 11 and Windows Server 2022 thread affinities can span + // processor groups and can be set as such by a new WinAPI function. 
+ static const bool CanAffinitySpanProcessorGroups = []() { + HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks")); + return SetThreadSelectedCpuSetMasks_f != nullptr; + }(); + + WORD numProcGroups = GetActiveProcessorGroupCount(); + for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup) + { + for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number) + { + PROCESSOR_NUMBER procnum; + procnum.Group = procGroup; + procnum.Number = number; + procnum.Reserved = 0; + USHORT nodeNumber; + + // When start /affinity or taskset was used to run this process with restricted affinity + // GetNumaProcessorNodeEx will NOT correspond to the system's processor setup, instead + // it appears to follow a completely new processor assignment, made specifically for this process, + // in which processors that this process has affinity for are remapped, and only those are remapped, + // to form a new set of processors. In other words, we can only get processors + // which we have affinity for this way. This means that the behaviour for + // `respectProcessAffinity == false` may be unexpected when affinity is set from outside, + // while the behaviour for `respectProcessAffinity == true` is given by default. + const BOOL status = GetNumaProcessorNodeEx(&procnum, &nodeNumber); + const CpuIndex c = static_cast(procGroup) * WIN_PROCESSOR_GROUP_SIZE + + static_cast(number); + if (status != 0 && nodeNumber != std::numeric_limits::max() + && is_cpu_allowed(c)) + { + cfg.add_cpu_to_node(nodeNumber, c); + } + } + } + + // Split the NUMA nodes to be contained within a group if necessary. + // This is needed between Windows 10 Build 20348 and Windows 11, because + // the new NUMA allocation behaviour was introduced while there was + // still no way to set thread affinity spanning multiple processor groups. + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + if (!CanAffinitySpanProcessorGroups) + { + NumaConfig splitCfg = empty(); + + NumaIndex splitNodeIndex = 0; + for (const auto& cpus : cfg.nodes) + { + if (cpus.empty()) + continue; + + size_t lastProcGroupIndex = *(cpus.begin()) / WIN_PROCESSOR_GROUP_SIZE; + for (CpuIndex c : cpus) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + if (procGroupIndex != lastProcGroupIndex) + { + splitNodeIndex += 1; + lastProcGroupIndex = procGroupIndex; + } + splitCfg.add_cpu_to_node(splitNodeIndex, c); + } + splitNodeIndex += 1; + } + + cfg = std::move(splitCfg); + } + +#else + + // Fallback for unsupported systems. + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(NumaIndex{0}, c); + +#endif + + // We have to ensure no empty NUMA nodes persist. 
+ cfg.remove_empty_numa_nodes(); + + return cfg; + } + + // ':'-separated numa nodes + // ','-separated cpu indices + // supports "first-last" range syntax for cpu indices + // For example "0-15,128-143:16-31,144-159:32-47,160-175:48-63,176-191" + static NumaConfig from_string(const std::string& s) { + NumaConfig cfg = empty(); + + NumaIndex n = 0; + for (auto&& nodeStr : split(s, ":")) + { + bool addedAnyCpuInThisNode = false; + + for (const std::string& cpuStr : split(nodeStr, ",")) + { + if (cpuStr.empty()) + continue; + + auto parts = split(cpuStr, "-"); + if (parts.size() == 1) + { + const CpuIndex c = CpuIndex{str_to_size_t(parts[0])}; + if (!cfg.add_cpu_to_node(n, c)) + std::exit(EXIT_FAILURE); + } + else if (parts.size() == 2) + { + const CpuIndex cfirst = CpuIndex{str_to_size_t(parts[0])}; + const CpuIndex clast = CpuIndex{str_to_size_t(parts[1])}; + + if (!cfg.add_cpu_range_to_node(n, cfirst, clast)) + std::exit(EXIT_FAILURE); + } + else + { + std::exit(EXIT_FAILURE); + } + + addedAnyCpuInThisNode = true; + } + + if (addedAnyCpuInThisNode) + n += 1; + } + + cfg.customAffinity = true; + + return cfg; + } + + NumaConfig(const NumaConfig&) = delete; + NumaConfig(NumaConfig&&) = default; + NumaConfig& operator=(const NumaConfig&) = delete; + NumaConfig& operator=(NumaConfig&&) = default; + + bool is_cpu_assigned(CpuIndex n) const { return nodeByCpu.count(n) == 1; } + + NumaIndex num_numa_nodes() const { return nodes.size(); } + + CpuIndex num_cpus_in_numa_node(NumaIndex n) const { + assert(n < nodes.size()); + return nodes[n].size(); + } + + CpuIndex num_cpus() const { return nodeByCpu.size(); } + + bool requires_memory_replication() const { return customAffinity || nodes.size() > 1; } + + std::string to_string() const { + std::string str; + + bool isFirstNode = true; + for (auto&& cpus : nodes) + { + if (!isFirstNode) + str += ":"; + + bool isFirstSet = true; + auto rangeStart = cpus.begin(); + for (auto it = cpus.begin(); it != cpus.end(); ++it) + { + auto next = std::next(it); + if (next == cpus.end() || *next != *it + 1) + { + // cpus[i] is at the end of the range (may be of size 1) + if (!isFirstSet) + str += ","; + + const CpuIndex last = *it; + + if (it != rangeStart) + { + const CpuIndex first = *rangeStart; + + str += std::to_string(first); + str += "-"; + str += std::to_string(last); + } + else + str += std::to_string(last); + + rangeStart = next; + isFirstSet = false; + } + } + + isFirstNode = false; + } + + return str; + } + + bool suggests_binding_threads(CpuIndex numThreads) const { + // If we can reasonably determine that the threads can't be contained + // by the OS within the first NUMA node then we advise distributing + // and binding threads. When the threads are not bound we can only use + // NUMA memory replicated objects from the first node, so when the OS + // has to schedule on other nodes we lose performance. + // We also suggest binding if there's enough threads to distribute among nodes + // with minimal disparity. + // We try to ignore small nodes, in particular the empty ones. + + // If the affinity set by the user does not match the affinity given by the OS + // then binding is necessary to ensure the threads are running on correct processors. + if (customAffinity) + return true; + + // We obviously can't distribute a single thread, so a single thread should never be bound. 
+ if (numThreads <= 1) + return false; + + size_t largestNodeSize = 0; + for (auto&& cpus : nodes) + if (cpus.size() > largestNodeSize) + largestNodeSize = cpus.size(); + + auto is_node_small = [largestNodeSize](const std::set& node) { + static constexpr double SmallNodeThreshold = 0.6; + return static_cast(node.size()) / static_cast(largestNodeSize) + <= SmallNodeThreshold; + }; + + size_t numNotSmallNodes = 0; + for (auto&& cpus : nodes) + if (!is_node_small(cpus)) + numNotSmallNodes += 1; + + return (numThreads > largestNodeSize / 2 || numThreads >= numNotSmallNodes * 4) + && nodes.size() > 1; + } + + std::vector distribute_threads_among_numa_nodes(CpuIndex numThreads) const { + std::vector ns; + + if (nodes.size() == 1) + { + // special case for when there's no NUMA nodes + // doesn't buy us much, but let's keep the default path simple + ns.resize(numThreads, NumaIndex{0}); + } + else + { + std::vector occupation(nodes.size(), 0); + for (CpuIndex c = 0; c < numThreads; ++c) + { + NumaIndex bestNode{0}; + float bestNodeFill = std::numeric_limits::max(); + for (NumaIndex n = 0; n < nodes.size(); ++n) + { + float fill = + static_cast(occupation[n] + 1) / static_cast(nodes[n].size()); + // NOTE: Do we want to perhaps fill the first available node up to 50% first before considering other nodes? + // Probably not, because it would interfere with running multiple instances. We basically shouldn't + // favor any particular node. + if (fill < bestNodeFill) + { + bestNode = n; + bestNodeFill = fill; + } + } + ns.emplace_back(bestNode); + occupation[bestNode] += 1; + } + } + + return ns; + } + + NumaReplicatedAccessToken bind_current_thread_to_numa_node(NumaIndex n) const { + if (n >= nodes.size() || nodes[n].size() == 0) + std::exit(EXIT_FAILURE); + +#if defined(__linux__) && !defined(__ANDROID__) + + cpu_set_t* mask = CPU_ALLOC(highestCpuIndex + 1); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(highestCpuIndex + 1); + + CPU_ZERO_S(masksize, mask); + + for (CpuIndex c : nodes[n]) + CPU_SET_S(c, masksize, mask); + + const int status = sched_setaffinity(0, masksize, mask); + + CPU_FREE(mask); + + if (status != 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + sched_yield(); + +#elif defined(_WIN32) + + // Requires Windows 11. No good way to set thread affinity spanning processor groups before that. + HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks")); + auto SetThreadGroupAffinity_f = + SetThreadGroupAffinity_t((void (*)()) GetProcAddress(k32, "SetThreadGroupAffinity")); + + if (SetThreadSelectedCpuSetMasks_f != nullptr) + { + // Only available on Windows 11 and Windows Server 2022 onwards. 
+ const USHORT numProcGroups = + ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE; + auto groupAffinities = std::make_unique(numProcGroups); + std::memset(groupAffinities.get(), 0, sizeof(GROUP_AFFINITY) * numProcGroups); + for (WORD i = 0; i < numProcGroups; ++i) + groupAffinities[i].Group = i; + + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + groupAffinities[procGroupIndex].Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = + SetThreadSelectedCpuSetMasks_f(hThread, groupAffinities.get(), numProcGroups); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + SwitchToThread(); + } + else if (SetThreadGroupAffinity_f != nullptr) + { + // On earlier windows version (since windows 7) we can't run a single thread + // on multiple processor groups, so we need to restrict the group. + // We assume the group of the first processor listed for this node. + // Processors from outside this group will not be assigned for this thread. + // Normally this won't be an issue because windows used to assign NUMA nodes + // such that they can't span processor groups. However, since Windows 10 Build 20348 + // the behaviour changed, so there's a small window of versions between this and Windows 11 + // that might exhibit problems with not all processors being utilized. + // We handle this in NumaConfig::from_system by manually splitting the nodes when + // we detect that there's no function to set affinity spanning processor nodes. + // This is required because otherwise our thread distribution code may produce + // suboptimal results. + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + GROUP_AFFINITY affinity; + std::memset(&affinity, 0, sizeof(GROUP_AFFINITY)); + affinity.Group = static_cast(n); + // We use an ordered set so we're guaranteed to get the smallest cpu number here. + const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE; + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + // We skip processors that are not in the same proccessor group. + // If everything was set up correctly this will never be an issue, + // but we have to account for bad NUMA node specification. + if (procGroupIndex != forcedProcGroupIndex) + continue; + + affinity.Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = SetThreadGroupAffinity_f(hThread, &affinity, nullptr); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. 
+ SwitchToThread(); + } + +#endif + + return NumaReplicatedAccessToken(n); + } + + template + void execute_on_numa_node(NumaIndex n, FuncT&& f) const { + std::thread th([this, &f, n]() { + bind_current_thread_to_numa_node(n); + std::forward(f)(); + }); + + th.join(); + } + + private: + std::vector> nodes; + std::map nodeByCpu; + CpuIndex highestCpuIndex; + + bool customAffinity; + + static NumaConfig empty() { return NumaConfig(EmptyNodeTag{}); } + + struct EmptyNodeTag {}; + + NumaConfig(EmptyNodeTag) : + highestCpuIndex(0), + customAffinity(false) {} + + void remove_empty_numa_nodes() { + std::vector> newNodes; + for (auto&& cpus : nodes) + if (!cpus.empty()) + newNodes.emplace_back(std::move(cpus)); + nodes = std::move(newNodes); + } + + // Returns true if successful + // Returns false if failed, i.e. when the cpu is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_to_node(NumaIndex n, CpuIndex c) { + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + nodes[n].insert(c); + nodeByCpu[c] = n; + + if (c > highestCpuIndex) + highestCpuIndex = c; + + return true; + } + + // Returns true if successful + // Returns false if failed, i.e. when any of the cpus is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_range_to_node(NumaIndex n, CpuIndex cfirst, CpuIndex clast) { + for (CpuIndex c = cfirst; c <= clast; ++c) + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + for (CpuIndex c = cfirst; c <= clast; ++c) + { + nodes[n].insert(c); + nodeByCpu[c] = n; + } + + if (clast > highestCpuIndex) + highestCpuIndex = clast; + + return true; + } +}; + +class NumaReplicationContext; + +// Instances of this class are tracked by the NumaReplicationContext instance +// NumaReplicationContext informs all tracked instances whenever NUMA configuration changes. +class NumaReplicatedBase { + public: + NumaReplicatedBase(NumaReplicationContext& ctx); + + NumaReplicatedBase(const NumaReplicatedBase&) = delete; + NumaReplicatedBase(NumaReplicatedBase&& other) noexcept; + + NumaReplicatedBase& operator=(const NumaReplicatedBase&) = delete; + NumaReplicatedBase& operator=(NumaReplicatedBase&& other) noexcept; + + virtual void on_numa_config_changed() = 0; + virtual ~NumaReplicatedBase(); + + const NumaConfig& get_numa_config() const; + + private: + NumaReplicationContext* context; +}; + +// We force boxing with a unique_ptr. If this becomes an issue due to added indirection we +// may need to add an option for a custom boxing type. +// When the NUMA config changes the value stored at the index 0 is replicated to other nodes. 
+template +class NumaReplicated: public NumaReplicatedBase { + public: + using ReplicatorFuncType = std::function; + + NumaReplicated(NumaReplicationContext& ctx) : + NumaReplicatedBase(ctx) { + replicate_from(T{}); + } + + NumaReplicated(NumaReplicationContext& ctx, T&& source) : + NumaReplicatedBase(ctx) { + replicate_from(std::move(source)); + } + + NumaReplicated(const NumaReplicated&) = delete; + NumaReplicated(NumaReplicated&& other) noexcept : + NumaReplicatedBase(std::move(other)), + instances(std::exchange(other.instances, {})) {} + + NumaReplicated& operator=(const NumaReplicated&) = delete; + NumaReplicated& operator=(NumaReplicated&& other) noexcept { + NumaReplicatedBase::operator=(*this, std::move(other)); + instances = std::exchange(other.instances, {}); + + return *this; + } + + NumaReplicated& operator=(T&& source) { + replicate_from(std::move(source)); + + return *this; + } + + ~NumaReplicated() override = default; + + const T& operator[](NumaReplicatedAccessToken token) const { + assert(token.get_numa_index() < instances.size()); + return *(instances[token.get_numa_index()]); + } + + const T& operator*() const { return *(instances[0]); } + + const T* operator->() const { return instances[0].get(); } + + template + void modify_and_replicate(FuncT&& f) { + auto source = std::move(instances[0]); + std::forward(f)(*source); + replicate_from(std::move(*source)); + } + + void on_numa_config_changed() override { + // Use the first one as the source. It doesn't matter which one we use, because they all must + // be identical, but the first one is guaranteed to exist. + auto source = std::move(instances[0]); + replicate_from(std::move(*source)); + } + + private: + std::vector> instances; + + void replicate_from(T&& source) { + instances.clear(); + + const NumaConfig& cfg = get_numa_config(); + if (cfg.requires_memory_replication()) + { + for (NumaIndex n = 0; n < cfg.num_numa_nodes(); ++n) + { + cfg.execute_on_numa_node( + n, [this, &source]() { instances.emplace_back(std::make_unique(source)); }); + } + } + else + { + assert(cfg.num_numa_nodes() == 1); + // We take advantage of the fact that replication is not required + // and reuse the source value, avoiding one copy operation. 
+ instances.emplace_back(std::make_unique(std::move(source))); + } + } +}; + +class NumaReplicationContext { + public: + NumaReplicationContext(NumaConfig&& cfg) : + config(std::move(cfg)) {} + + NumaReplicationContext(const NumaReplicationContext&) = delete; + NumaReplicationContext(NumaReplicationContext&&) = delete; + + NumaReplicationContext& operator=(const NumaReplicationContext&) = delete; + NumaReplicationContext& operator=(NumaReplicationContext&&) = delete; + + ~NumaReplicationContext() { + // The context must outlive replicated objects + if (!trackedReplicatedObjects.empty()) + std::exit(EXIT_FAILURE); + } + + void attach(NumaReplicatedBase* obj) { + assert(trackedReplicatedObjects.count(obj) == 0); + trackedReplicatedObjects.insert(obj); + } + + void detach(NumaReplicatedBase* obj) { + assert(trackedReplicatedObjects.count(obj) == 1); + trackedReplicatedObjects.erase(obj); + } + + // oldObj may be invalid at this point + void move_attached([[maybe_unused]] NumaReplicatedBase* oldObj, NumaReplicatedBase* newObj) { + assert(trackedReplicatedObjects.count(oldObj) == 1); + assert(trackedReplicatedObjects.count(newObj) == 0); + trackedReplicatedObjects.erase(oldObj); + trackedReplicatedObjects.insert(newObj); + } + + void set_numa_config(NumaConfig&& cfg) { + config = std::move(cfg); + for (auto&& obj : trackedReplicatedObjects) + obj->on_numa_config_changed(); + } + + const NumaConfig& get_numa_config() const { return config; } + + private: + NumaConfig config; + + // std::set uses std::less by default, which is required for pointer comparison to be defined. + std::set trackedReplicatedObjects; +}; + +inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicationContext& ctx) : + context(&ctx) { + context->attach(this); +} + +inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicatedBase&& other) noexcept : + context(std::exchange(other.context, nullptr)) { + context->move_attached(&other, this); +} + +inline NumaReplicatedBase& NumaReplicatedBase::operator=(NumaReplicatedBase&& other) noexcept { + context = std::exchange(other.context, nullptr); + + context->move_attached(&other, this); + + return *this; +} + +inline NumaReplicatedBase::~NumaReplicatedBase() { + if (context != nullptr) + context->detach(this); +} + +inline const NumaConfig& NumaReplicatedBase::get_numa_config() const { + return context->get_numa_config(); +} + +} // namespace Stockfish + + +#endif // #ifndef NUMA_H_INCLUDED diff --git a/src/search.cpp b/src/search.cpp index 0dbc6a3a5db..c074e3421ea 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -137,15 +137,17 @@ void update_all_stats(const Position& pos, Search::Worker::Worker(SharedState& sharedState, std::unique_ptr sm, - size_t thread_id) : + size_t thread_id, + NumaReplicatedAccessToken token) : // Unpack the SharedState struct into member variables thread_idx(thread_id), + numaAccessToken(token), manager(std::move(sm)), options(sharedState.options), threads(sharedState.threads), tt(sharedState.tt), networks(sharedState.networks), - refreshTable(networks) { + refreshTable(networks[token]) { clear(); } @@ -428,7 +430,7 @@ void Search::Worker::iterative_deepening() { skill.pick_best(rootMoves, multiPV); // Use part of the gained time from a previous stable move for the current move - for (Thread* th : threads) + for (auto&& th : threads) { totBestMoveChanges += th->worker->bestMoveChanges; th->worker->bestMoveChanges = 0; @@ -510,7 +512,7 @@ void Search::Worker::clear() { for (size_t i = 1; i < reductions.size(); ++i) reductions[i] = int((19.90 + 
std::log(size_t(options["Threads"])) / 2) * std::log(i)); - refreshTable.clear(networks); + refreshTable.clear(networks[numaAccessToken]); } @@ -576,9 +578,9 @@ Value Search::Worker::search( // Step 2. Check for aborted search and immediate draw if (threads.stop.load(std::memory_order_relaxed) || pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) - return (ss->ply >= MAX_PLY && !ss->inCheck) - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) - : value_draw(thisThread->nodes); + return (ss->ply >= MAX_PLY && !ss->inCheck) ? evaluate( + networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) + : value_draw(thisThread->nodes); // Step 3. Mate distance pruning. Even if we mate at the next move our score // would be at best mate_in(ss->ply + 1), but if alpha is already bigger because @@ -706,7 +708,7 @@ Value Search::Worker::search( { // Providing the hint that this node's accumulator will be used often // brings significant Elo gain (~13 Elo). - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); unadjustedStaticEval = eval = ss->staticEval; } else if (ss->ttHit) @@ -714,9 +716,10 @@ Value Search::Worker::search( // Never assume anything about values stored in TT unadjustedStaticEval = tte->eval(); if (unadjustedStaticEval == VALUE_NONE) - unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + unadjustedStaticEval = + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); else if (PvNode) - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); @@ -726,7 +729,8 @@ Value Search::Worker::search( } else { - unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + unadjustedStaticEval = + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); // Static evaluation is saved as it was before adjustment by correction history @@ -892,7 +896,7 @@ Value Search::Worker::search( } } - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); } moves_loop: // When in check, search starts here @@ -1441,7 +1445,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, // Step 2. Check for an immediate draw or maximum ply reached if (pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) return (ss->ply >= MAX_PLY && !ss->inCheck) - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) + ? 
evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) : VALUE_DRAW; assert(0 <= ss->ply && ss->ply < MAX_PLY); @@ -1476,7 +1480,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, unadjustedStaticEval = tte->eval(); if (unadjustedStaticEval == VALUE_NONE) unadjustedStaticEval = - evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); ss->staticEval = bestValue = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); @@ -1488,10 +1492,11 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, else { // In case of null move search, use previous static eval with a different sign - unadjustedStaticEval = (ss - 1)->currentMove != Move::null() - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) - : -(ss - 1)->staticEval; - ss->staticEval = bestValue = + unadjustedStaticEval = + (ss - 1)->currentMove != Move::null() + ? evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) + : -(ss - 1)->staticEval; + ss->staticEval = bestValue = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); } diff --git a/src/search.h b/src/search.h index 6e5b22bda32..a61f253c005 100644 --- a/src/search.h +++ b/src/search.h @@ -32,19 +32,17 @@ #include "misc.h" #include "movepick.h" +#include "nnue/network.h" +#include "nnue/nnue_accumulator.h" +#include "numa.h" #include "position.h" #include "score.h" #include "syzygy/tbprobe.h" #include "timeman.h" #include "types.h" -#include "nnue/nnue_accumulator.h" namespace Stockfish { -namespace Eval::NNUE { -struct Networks; -} - // Different node types, used as a template parameter enum NodeType { NonPV, @@ -133,19 +131,19 @@ struct LimitsType { // The UCI stores the uci options, thread pool, and transposition table. // This struct is used to easily forward data to the Search::Worker class. struct SharedState { - SharedState(const OptionsMap& optionsMap, - ThreadPool& threadPool, - TranspositionTable& transpositionTable, - const Eval::NNUE::Networks& nets) : + SharedState(const OptionsMap& optionsMap, + ThreadPool& threadPool, + TranspositionTable& transpositionTable, + const NumaReplicated& nets) : options(optionsMap), threads(threadPool), tt(transpositionTable), networks(nets) {} - const OptionsMap& options; - ThreadPool& threads; - TranspositionTable& tt; - const Eval::NNUE::Networks& networks; + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const NumaReplicated& networks; }; class Worker; @@ -236,7 +234,7 @@ class NullSearchManager: public ISearchManager { // of the search history, and storing data required for the search. 
 class Worker {
    public:
-    Worker(SharedState&, std::unique_ptr<ISearchManager>, size_t);
+    Worker(SharedState&, std::unique_ptr<ISearchManager>, size_t, NumaReplicatedAccessToken);

     // Called at instantiation to initialize Reductions tables
     // Reset histories, usually before a new game
@@ -293,7 +291,8 @@ class Worker {
     Depth rootDepth, completedDepth;
     Value rootDelta;

-    size_t thread_idx;
+    size_t                    thread_idx;
+    NumaReplicatedAccessToken numaAccessToken;

     // Reductions lookup table initialized at startup
     std::array<int, MAX_MOVES> reductions;  // [depth or moveNumber]
@@ -303,10 +302,10 @@ class Worker {

     Tablebases::Config tbConfig;

-    const OptionsMap&           options;
-    ThreadPool&                 threads;
-    TranspositionTable&         tt;
-    const Eval::NNUE::Networks& networks;
+    const OptionsMap&                           options;
+    ThreadPool&                                 threads;
+    TranspositionTable&                         tt;
+    const NumaReplicated<Eval::NNUE::Networks>& networks;

     // Used by NNUE
     Eval::NNUE::AccumulatorCaches refreshTable;

diff --git a/src/thread.cpp b/src/thread.cpp
index 8724cb49cd1..5893f4b6d07 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -22,19 +22,17 @@
 #include
 #include
 #include
+#include
 #include
 #include
-#include

-#include "misc.h"
 #include "movegen.h"
 #include "search.h"
 #include "syzygy/tbprobe.h"
 #include "timeman.h"
-#include "tt.h"
 #include "types.h"
-#include "ucioption.h"
 #include "uci.h"
+#include "ucioption.h"

 namespace Stockfish {

@@ -42,13 +40,24 @@ namespace Stockfish {
 // in idle_loop(). Note that 'searching' and 'exit' should be already set.
 Thread::Thread(Search::SharedState&                    sharedState,
                std::unique_ptr<Search::ISearchManager> sm,
-               size_t n) :
-    worker(std::make_unique<Search::Worker>(sharedState, std::move(sm), n)),
+               size_t                                  n,
+               OptionalThreadToNumaNodeBinder          binder) :
     idx(n),
     nthreads(sharedState.options["Threads"]),
     stdThread(&Thread::idle_loop, this) {

     wait_for_search_finished();
+
+    run_custom_job([this, &binder, &sharedState, &sm, n]() {
+        // Use the binder to [maybe] bind the threads to a NUMA node before doing
+        // the Worker allocation.
+        // Ideally we would also allocate the SearchManager here, but that's minor.
+        this->numaAccessToken = binder();
+        this->worker =
+            std::make_unique<Search::Worker>(sharedState, std::move(sm), n, this->numaAccessToken);
+    });
+
+    wait_for_search_finished();
 }


@@ -66,12 +75,15 @@ Thread::~Thread() {

 // Wakes up the thread that will start the search
 void Thread::start_searching() {
-    mutex.lock();
-    searching = true;
-    mutex.unlock();   // Unlock before notifying saves a few CPU-cycles
-    cv.notify_one();  // Wake up the thread in idle_loop()
+    assert(worker != nullptr);
+    run_custom_job([this]() { worker->start_searching(); });
 }

+// Clears the thread worker's histories
+void Thread::clear_worker() {
+    assert(worker != nullptr);
+    run_custom_job([this]() { worker->clear(); });
+}

 // Blocks on the condition variable
 // until the thread has finished searching.
@@ -81,20 +93,20 @@ void Thread::wait_for_search_finished() {
     cv.wait(lk, [&] { return !searching; });
 }

+void Thread::run_custom_job(std::function<void()> f) {
+    {
+        std::unique_lock<std::mutex> lk(mutex);
+        cv.wait(lk, [&] { return !searching; });
+        jobFunc   = std::move(f);
+        searching = true;
+    }
+    cv.notify_one();
+}

 // Thread gets parked here, blocked on the
 // condition variable, when it has no work to do.
 void Thread::idle_loop() {
-
-    // If OS already scheduled us on a different group than 0 then don't overwrite
-    // the choice, eventually we are one of many one-threaded processes running on
-    // some Windows NUMA hardware, for instance in fishtest. To make it simple,
-    // just check if running threads are below a threshold, in this case, all this
-    // NUMA machinery is not needed.
-    if (nthreads > 8)
-        WinProcGroup::bind_this_thread(idx);
-
     while (true)
     {
         std::unique_lock<std::mutex> lk(mutex);
@@ -105,9 +117,13 @@ void Thread::idle_loop() {
         if (exit)
             return;

+        std::function<void()> job = std::move(jobFunc);
+        jobFunc                   = nullptr;
+
         lk.unlock();

-        worker->start_searching();
+        if (job)
+            job();
     }
 }

@@ -121,49 +137,82 @@ uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits
 // Creates/destroys threads to match the requested number.
 // Created and launched threads will immediately go to sleep in idle_loop.
 // Upon resizing, threads are recreated to allow for binding if necessary.
-void ThreadPool::set(Search::SharedState sharedState,
+void ThreadPool::set(const NumaConfig&                           numaConfig,
+                     Search::SharedState                         sharedState,
                      const Search::SearchManager::UpdateContext& updateContext) {

     if (threads.size() > 0)  // destroy any existing thread(s)
     {
         main_thread()->wait_for_search_finished();

-        while (threads.size() > 0)
-            delete threads.back(), threads.pop_back();
+        threads.clear();
+
+        boundThreadToNumaNode.clear();
     }

     const size_t requested = sharedState.options["Threads"];

     if (requested > 0)  // create new thread(s)
     {
-        auto manager = std::make_unique<Search::SearchManager>(updateContext);
-        threads.push_back(new Thread(sharedState, std::move(manager), 0));
+        // Binding threads may be problematic when there are multiple NUMA nodes and
+        // multiple Stockfish instances running. In particular, if each instance
+        // runs a single thread then they would all be mapped to the first NUMA node.
+        // This is undesirable, and so the default behaviour (i.e. when the user does not
+        // change the NumaPolicy UCI setting) is to not bind the threads to processors
+        // unless we know for sure that we span NUMA nodes and replication is required.
+        const std::string numaPolicy(sharedState.options["NumaPolicy"]);
+        const bool        doBindThreads = [&]() {
+            if (numaPolicy == "none")
+                return false;
+
+            if (numaPolicy == "auto")
+                return numaConfig.suggests_binding_threads(requested);
+
+            // numaPolicy == "system", or explicitly set by the user
+            return true;
+        }();
+
+        boundThreadToNumaNode = doBindThreads
+                                ? numaConfig.distribute_threads_among_numa_nodes(requested)
+                                : std::vector<NumaIndex>{};

         while (threads.size() < requested)
         {
-            auto null_manager = std::make_unique<Search::NullSearchManager>();
-            threads.push_back(new Thread(sharedState, std::move(null_manager), threads.size()));
+            const size_t    threadId = threads.size();
+            const NumaIndex numaId   = doBindThreads ? boundThreadToNumaNode[threadId] : 0;
+            auto            manager  = threadId == 0 ? std::unique_ptr<Search::ISearchManager>(
+                             std::make_unique<Search::SearchManager>(updateContext))
+                                                     : std::make_unique<Search::NullSearchManager>();
+
+            // When not binding threads we want to force all access to happen
+            // from the same NUMA node, because in case of NUMA replicated memory
+            // accesses we don't want to trash the cache in case the threads get scheduled
+            // on the same NUMA node.
+            auto binder = doBindThreads ? OptionalThreadToNumaNodeBinder(numaConfig, numaId)
+                                        : OptionalThreadToNumaNodeBinder(numaId);
+
+            threads.emplace_back(
+              std::make_unique<Thread>(sharedState, std::move(manager), threadId, binder));
         }

         clear();

         main_thread()->wait_for_search_finished();
-
-        // Reallocate the hash with the new threadpool size
-        sharedState.tt.resize(sharedState.options["Hash"], requested);
     }
 }


 // Sets threadPool data to initial values
 void ThreadPool::clear() {
-
-    for (Thread* th : threads)
-        th->worker->clear();
-
     if (threads.size() == 0)
         return;

+    for (auto&& th : threads)
+        th->clear_worker();
+
+    for (auto&& th : threads)
+        th->wait_for_search_finished();
+
     main_manager()->callsCnt                 = 0;
     main_manager()->bestPreviousScore        = VALUE_INFINITE;
     main_manager()->bestPreviousAverageScore = VALUE_INFINITE;
@@ -172,6 +221,17 @@ void ThreadPool::clear() {
     main_manager()->tm.clear();
 }

+void ThreadPool::run_on_thread(size_t threadId, std::function<void()> f) {
+    assert(threads.size() > threadId);
+    threads[threadId]->run_custom_job(std::move(f));
+}
+
+void ThreadPool::wait_on_thread(size_t threadId) {
+    assert(threads.size() > threadId);
+    threads[threadId]->wait_for_search_finished();
+}
+
+size_t ThreadPool::num_threads() const { return threads.size(); }

 // Wakes up main thread waiting in idle_loop() and
 // returns immediately. Main thread will wake up other threads and start the search.
@@ -216,31 +276,36 @@ void ThreadPool::start_thinking(const OptionsMap& options,
     // be deduced from a fen string, so set() clears them and they are set from
     // setupStates->back() later. The rootState is per thread, earlier states are shared
     // since they are read-only.
-    for (Thread* th : threads)
+    for (auto&& th : threads)
     {
-        th->worker->limits = limits;
-        th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly =
-            th->worker->bestMoveChanges = 0;
-        th->worker->rootDepth = th->worker->completedDepth = 0;
-        th->worker->rootMoves = rootMoves;
-        th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState);
-        th->worker->rootState = setupStates->back();
-        th->worker->tbConfig = tbConfig;
+        th->run_custom_job([&]() {
+            th->worker->limits = limits;
+            th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly =
+              th->worker->bestMoveChanges          = 0;
+            th->worker->rootDepth = th->worker->completedDepth = 0;
+            th->worker->rootMoves                              = rootMoves;
+            th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState);
+            th->worker->rootState = setupStates->back();
+            th->worker->tbConfig  = tbConfig;
+        });
     }

+    for (auto&& th : threads)
+        th->wait_for_search_finished();
+
     main_thread()->start_searching();
 }

 Thread* ThreadPool::get_best_thread() const {

-    Thread* bestThread = threads.front();
+    Thread* bestThread = threads.front().get();
     Value   minScore   = VALUE_NONE;

     std::unordered_map<Move, int64_t, Move::MoveHash> votes(
       2 * std::min(size(), bestThread->worker->rootMoves.size()));

     // Find the minimum score of all threads
-    for (Thread* th : threads)
+    for (auto&& th : threads)
         minScore = std::min(minScore, th->worker->rootMoves[0].score);

     // Vote according to score and depth, and select the best thread
@@ -248,10 +313,10 @@ Thread* ThreadPool::get_best_thread() const {
         return (th->worker->rootMoves[0].score - minScore + 14) * int(th->worker->completedDepth);
     };

-    for (Thread* th : threads)
-        votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th);
+    for (auto&& th : threads)
+        votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th.get());

-    for (Thread* th : threads)
+    for (auto&& th : threads)
     {
         const auto bestThreadScore = bestThread->worker->rootMoves[0].score;
         const auto newThreadScore  = th->worker->rootMoves[0].score;
@@ -272,26 +337,26 @@ Thread* ThreadPool::get_best_thread() const {
         // Note that we make sure not to pick a thread with truncated-PV for better viewer experience.
         const bool betterVotingValue =
-          thread_voting_value(th) * int(newThreadPV.size() > 2)
+          thread_voting_value(th.get()) * int(newThreadPV.size() > 2)
           > thread_voting_value(bestThread) * int(bestThreadPV.size() > 2);

         if (bestThreadInProvenWin)
         {
             // Make sure we pick the shortest mate / TB conversion
             if (newThreadScore > bestThreadScore)
-                bestThread = th;
+                bestThread = th.get();
         }
         else if (bestThreadInProvenLoss)
         {
             // Make sure we pick the shortest mated / TB conversion
             if (newThreadInProvenLoss && newThreadScore < bestThreadScore)
-                bestThread = th;
+                bestThread = th.get();
         }
         else if (newThreadInProvenWin || newThreadInProvenLoss
                  || (newThreadScore > VALUE_TB_LOSS_IN_MAX_PLY
                      && (newThreadMoveVote > bestThreadMoveVote
                          || (newThreadMoveVote == bestThreadMoveVote && betterVotingValue))))
-            bestThread = th;
+            bestThread = th.get();
     }

     return bestThread;
@@ -302,7 +367,7 @@ Thread* ThreadPool::get_best_thread() const {

 // Will be invoked by main thread after it has started searching
 void ThreadPool::start_searching() {

-    for (Thread* th : threads)
+    for (auto&& th : threads)
         if (th != threads.front())
             th->start_searching();
 }
@@ -312,9 +377,28 @@ void ThreadPool::start_searching() {
 void ThreadPool::wait_for_search_finished() const {

-    for (Thread* th : threads)
+    for (auto&& th : threads)
         if (th != threads.front())
             th->wait_for_search_finished();
 }

+std::vector<size_t> ThreadPool::get_bound_thread_count_by_numa_node() const {
+    std::vector<size_t> counts;
+
+    if (!boundThreadToNumaNode.empty())
+    {
+        NumaIndex highestNumaNode = 0;
+        for (NumaIndex n : boundThreadToNumaNode)
+            if (n > highestNumaNode)
+                highestNumaNode = n;
+
+        counts.resize(highestNumaNode + 1, 0);
+
+        for (NumaIndex n : boundThreadToNumaNode)
+            counts[n] += 1;
+    }
+
+    return counts;
+}
+
 }  // namespace Stockfish

diff --git a/src/thread.h b/src/thread.h
index 223652aec99..102b229907b 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -26,10 +26,12 @@
 #include
 #include
 #include
+#include

 #include "position.h"
 #include "search.h"
 #include "thread_win32_osx.h"
+#include "numa.h"

 namespace Stockfish {

@@ -37,6 +39,32 @@ namespace Stockfish {
 class OptionsMap;
 using Value = int;

+// Sometimes we don't want to actually bind the threads, but the recipient still
+// needs to think it runs on *some* NUMA node, such that it can access structures
+// that rely on NUMA node knowledge. This class encapsulates this optional process
+// such that the recipient does not need to know whether the binding happened or not.
+class OptionalThreadToNumaNodeBinder {
+   public:
+    OptionalThreadToNumaNodeBinder(NumaIndex n) :
+        numaConfig(nullptr),
+        numaId(n) {}
+
+    OptionalThreadToNumaNodeBinder(const NumaConfig& cfg, NumaIndex n) :
+        numaConfig(&cfg),
+        numaId(n) {}
+
+    NumaReplicatedAccessToken operator()() const {
+        if (numaConfig != nullptr)
+            return numaConfig->bind_current_thread_to_numa_node(numaId);
+        else
+            return NumaReplicatedAccessToken(numaId);
+    }
+
+   private:
+    const NumaConfig* numaConfig;
+    NumaIndex         numaId;
+};
+
 // Abstraction of a thread. It contains a pointer to the worker and a native thread.
 // After construction, the native thread is started with idle_loop()
 // waiting for a signal to start searching.
@@ -44,22 +72,35 @@ using Value = int;
 // the search is finished, it goes back to idle_loop() waiting for a new signal.
 class Thread {
    public:
-    Thread(Search::SharedState&, std::unique_ptr<Search::ISearchManager>, size_t);
+    Thread(Search::SharedState&,
+           std::unique_ptr<Search::ISearchManager>,
+           size_t,
+           OptionalThreadToNumaNodeBinder);
     virtual ~Thread();

-    void idle_loop();
-    void start_searching();
+    void   idle_loop();
+    void   start_searching();
+    void   clear_worker();
+    void   run_custom_job(std::function<void()> f);
+
+    // Thread has been slightly altered to allow running custom jobs, so
+    // this name is no longer correct. However, this class (and ThreadPool)
+    // require further work to make them properly generic while maintaining
+    // appropriate specificity regarding search, from the point of view of an
+    // outside user, so renaming of this function is left for whenever that happens.
     void   wait_for_search_finished();
     size_t id() const { return idx; }

     std::unique_ptr<Search::Worker> worker;
+    std::function<void()>           jobFunc;

    private:
-    std::mutex mutex;
-    std::condition_variable cv;
-    size_t idx, nthreads;
-    bool exit = false, searching = true;  // Set before starting std::thread
-    NativeThread stdThread;
+    std::mutex                mutex;
+    std::condition_variable   cv;
+    size_t                    idx, nthreads;
+    bool                      exit = false, searching = true;  // Set before starting std::thread
+    NativeThread              stdThread;
+    NumaReplicatedAccessToken numaAccessToken;
 };


@@ -67,31 +108,44 @@ class Thread {
 // parking and, most importantly, launching a thread. All the access to threads
 // is done through this class.
 class ThreadPool {
-
    public:
+    ThreadPool() {}
+
     ~ThreadPool() {
         // destroy any existing thread(s)
         if (threads.size() > 0)
        {
             main_thread()->wait_for_search_finished();

-            while (threads.size() > 0)
-                delete threads.back(), threads.pop_back();
+            threads.clear();
         }
     }

-    void start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType);
-    void clear();
-    void set(Search::SharedState, const Search::SearchManager::UpdateContext&);
+    ThreadPool(const ThreadPool&)            = delete;
+    ThreadPool(ThreadPool&&)                 = delete;
+
+    ThreadPool& operator=(const ThreadPool&) = delete;
+    ThreadPool& operator=(ThreadPool&&)      = delete;
+
+    void   start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType);
+    void   run_on_thread(size_t threadId, std::function<void()> f);
+    void   wait_on_thread(size_t threadId);
+    size_t num_threads() const;
+    void   clear();
+    void   set(const NumaConfig& numaConfig,
+               Search::SharedState,
+               const Search::SearchManager::UpdateContext&);

     Search::SearchManager* main_manager();
-    Thread* main_thread() const { return threads.front(); }
+    Thread*                main_thread() const { return threads.front().get(); }
     uint64_t nodes_searched() const;
     uint64_t tb_hits() const;

     Thread* get_best_thread() const;
     void    start_searching();
     void    wait_for_search_finished() const;

+    std::vector<size_t> get_bound_thread_count_by_numa_node() const;
+
     std::atomic_bool stop, abortedSearch, increaseDepth;

     auto cbegin() const noexcept { return threads.cbegin(); }
@@ -102,13 +156,14 @@ class ThreadPool {
     auto empty() const noexcept { return threads.empty(); }

    private:
-    StateListPtr setupStates;
-    std::vector<Thread*> threads;
+    StateListPtr                         setupStates;
+    std::vector<std::unique_ptr<Thread>> threads;
+    std::vector<NumaIndex>               boundThreadToNumaNode;

     uint64_t accumulate(std::atomic<uint64_t> Search::Worker::*member) const {

         uint64_t sum = 0;
-        for (Thread* th : threads)
+        for (auto&& th : threads)
             sum += (th->worker.get()->*member).load(std::memory_order_relaxed);
         return sum;
     }

diff --git a/src/tt.cpp b/src/tt.cpp
index 3f5b9d4d9b0..79274f525b9 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -23,10 +23,10 @@
 #include
 #include
 #include
-#include
-#include

 #include "misc.h"
+#include "syzygy/tbprobe.h"
+#include "thread.h"

 namespace Stockfish {

@@ -74,7 +74,7 @@ uint8_t TTEntry::relative_age(const uint8_t generation8) const {
 // Sets the size of the transposition table,
 // measured in megabytes. Transposition table consists
 // of clusters and each cluster consists of ClusterSize number of TTEntry.
-void TranspositionTable::resize(size_t mbSize, int threadCount) {
+void TranspositionTable::resize(size_t mbSize, ThreadPool& threads) {
     aligned_large_pages_free(table);

     clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
@@ -86,32 +86,29 @@ void TranspositionTable::resize(size_t mbSize, int threadCount) {
         exit(EXIT_FAILURE);
     }

-    clear(threadCount);
+    clear(threads);
 }

 // Initializes the entire transposition table to zero,
 // in a multi-threaded way.
-void TranspositionTable::clear(size_t threadCount) {
-    std::vector<std::thread> threads;
+void TranspositionTable::clear(ThreadPool& threads) {
+    const size_t threadCount = threads.num_threads();

-    for (size_t idx = 0; idx < size_t(threadCount); ++idx)
+    for (size_t i = 0; i < threadCount; ++i)
     {
-        threads.emplace_back([this, idx, threadCount]() {
-            // Thread binding gives faster search on systems with a first-touch policy
-            if (threadCount > 8)
-                WinProcGroup::bind_this_thread(idx);
-
+        threads.run_on_thread(i, [this, i, threadCount]() {
             // Each thread will zero its part of the hash table
-            const size_t stride = size_t(clusterCount / threadCount), start = size_t(stride * idx),
-                         len = idx != size_t(threadCount) - 1 ? stride : clusterCount - start;
+            const size_t stride = clusterCount / threadCount;
+            const size_t start  = stride * i;
+            const size_t len    = i + 1 != threadCount ? stride : clusterCount - start;

             std::memset(&table[start], 0, len * sizeof(Cluster));
         });
     }

-    for (std::thread& th : threads)
-        th.join();
+    for (size_t i = 0; i < threadCount; ++i)
+        threads.wait_on_thread(i);
 }

diff --git a/src/tt.h b/src/tt.h
index 7cc876fb9ea..3b09ec4e1d9 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -63,6 +63,7 @@ struct TTEntry {
     int16_t eval16;
 };

+class ThreadPool;

 // A TranspositionTable is an array of Cluster, of size clusterCount. Each
 // cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry
@@ -102,8 +103,8 @@ class TranspositionTable {
     TTEntry* probe(const Key key, bool& found) const;
     int      hashfull() const;
-    void     resize(size_t mbSize, int threadCount);
-    void     clear(size_t threadCount);
+    void     resize(size_t mbSize, ThreadPool& threads);
+    void     clear(ThreadPool& threads);

     TTEntry* first_entry(const Key key) const {
         return &table[mul_hi64(key, clusterCount)].entry[0];

diff --git a/src/uci.cpp b/src/uci.cpp
index cb686a027db..ab0dae3946e 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -60,7 +60,16 @@ UCIEngine::UCIEngine(int argc, char** argv) :

     options["Debug Log File"] << Option("", [](const Option& o) { start_logger(o); });

-    options["Threads"] << Option(1, 1, 1024, [this](const Option&) { engine.resize_threads(); });
+    options["NumaPolicy"] << Option("auto", [this](const Option& o) {
+        engine.set_numa_config_from_option(o);
+        print_numa_config_information();
+        print_thread_binding_information();
+    });
+
+    options["Threads"] << Option(1, 1, 1024, [this](const Option&) {
+        engine.resize_threads();
+        print_thread_binding_information();
+    });

     options["Hash"] << Option(16, 1, MaxHashMB, [this](const Option& o) { engine.set_tt_size(o); });

@@ -123,8 +132,15 @@ void UCIEngine::loop() {
             engine.set_ponderhit(false);

         else if (token == "uci")
+        {
             sync_cout << "id name " << engine_info(true) << "\n"
-                      << engine.get_options() << "\nuciok" << sync_endl;
+                      << engine.get_options() << sync_endl;
+
+            print_numa_config_information();
+            print_thread_binding_information();
+
+            sync_cout << "uciok" << sync_endl;
+        }

         else if (token == "setoption")
             setoption(is);
@@ -177,6 +193,28 @@ void UCIEngine::loop() {
     } while (token != "quit" && cli.argc == 1);  // The command-line arguments are one-shot
 }

+void UCIEngine::print_numa_config_information() const {
+    auto cfgStr = engine.get_numa_config_as_string();
+    sync_cout << "info string Available Processors: " << cfgStr << sync_endl;
+}
+
+void UCIEngine::print_thread_binding_information() const {
+    auto boundThreadsByNode = engine.get_bound_thread_count_by_numa_node();
+    if (!boundThreadsByNode.empty())
+    {
+        sync_cout << "info string NUMA Node Thread Binding: ";
+        bool isFirst = true;
+        for (auto&& [current, total] : boundThreadsByNode)
+        {
+            if (!isFirst)
+                std::cout << ":";
+            std::cout << current << "/" << total;
+            isFirst = false;
+        }
+        std::cout << sync_endl;
+    }
+}
+
 Search::LimitsType UCIEngine::parse_limits(std::istream& is) {
     Search::LimitsType limits;
     std::string        token;

diff --git a/src/uci.h b/src/uci.h
index 55d580f9727..bac62bb90c0 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -42,6 +42,9 @@ class UCIEngine {

     void loop();

+    void print_numa_config_information() const;
+    void print_thread_binding_information() const;
+
     static int         to_cp(Value v, const Position& pos);
     static std::string format_score(const Score& s);
     static std::string square(Square s);

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index e1ffe546525..4819a68db73 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -118,6 +118,8 @@ bool Option::operator==(const char* s) const {
     return !CaseInsensitiveLess()(currentValue, s) && !CaseInsensitiveLess()(s, currentValue);
 }

+bool Option::operator!=(const char* s) const { return !(*this == s); }
+

 // Inits options and assigns idx in the correct printing order

diff --git a/src/ucioption.h b/src/ucioption.h
index b575d1646e6..16d46696145 100644
--- a/src/ucioption.h
+++ b/src/ucioption.h
@@ -67,6 +67,7 @@ class Option {
     operator int() const;
     operator std::string() const;
     bool operator==(const char*) const;
+    bool operator!=(const char*) const;

     friend std::ostream& operator<<(std::ostream&, const OptionsMap&);
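
For readers unfamiliar with the replication pattern used above (e.g. `networks[numaAccessToken]` in search.cpp), the following is a minimal illustrative sketch of the idea, not the actual `NumaReplicated`/`NumaReplicatedAccessToken` API from src/numa.h: each NUMA node gets its own copy of the data, and a thread's access token selects the copy belonging to the node the thread was bound to. Unbound threads all receive token 0 and therefore share a single copy. The `AccessToken`, `Replicated` and `Networks` names below are hypothetical stand-ins chosen for the sketch; the real classes additionally handle thread binding and creation of the per-node copies.

```
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for NumaReplicatedAccessToken: identifies which
// NUMA node's replica the calling thread should read from.
struct AccessToken {
    std::size_t nodeIdx = 0;
};

// Hypothetical stand-in for NumaReplicated<T>: keeps one copy of T per NUMA
// node and hands out the copy matching the caller's token.
template<typename T>
class Replicated {
   public:
    Replicated(const T& value, std::size_t numaNodeCount) :
        copies(numaNodeCount, value) {}

    // Mirrors the networks[numaAccessToken] usage in search.cpp.
    const T& operator[](AccessToken token) const { return copies[token.nodeIdx]; }

   private:
    std::vector<T> copies;
};

struct Networks {
    int dummyWeights = 42;  // placeholder for the NNUE weights
};

int main() {
    // Two NUMA nodes -> two replicas of the (placeholder) weights.
    Replicated<Networks> networks(Networks{}, 2);

    // Each thread would normally receive its token from the binder; here we
    // simply construct one token per node.
    AccessToken tokenNode0{0}, tokenNode1{1};

    std::cout << networks[tokenNode0].dummyWeights << ' '
              << networks[tokenNode1].dummyWeights << '\n';
    return 0;
}
```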