From 515bc9a264c1e44cfcada0405a74ab2fba428e10 Mon Sep 17 00:00:00 2001
From: Aleksey Shipilev
Date: Thu, 30 May 2024 07:35:57 +0000
Subject: [PATCH] 8318986: Improve GenericWaitBarrier performance

Backport-of: 30462f9da40d3a7ec18fcf46e2154fabb5fd4753
---
 .../share/utilities/waitBarrier_generic.cpp  | 261 ++++++++++++++----
 .../share/utilities/waitBarrier_generic.hpp  |  74 ++++-
 2 files changed, 274 insertions(+), 61 deletions(-)

diff --git a/src/hotspot/share/utilities/waitBarrier_generic.cpp b/src/hotspot/share/utilities/waitBarrier_generic.cpp
index b5d9ff67eb7..dbf4db336c2 100644
--- a/src/hotspot/share/utilities/waitBarrier_generic.cpp
+++ b/src/hotspot/share/utilities/waitBarrier_generic.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -29,66 +30,228 @@
 #include "utilities/waitBarrier_generic.hpp"
 #include "utilities/spinYield.hpp"
 
+// Implements the striped semaphore wait barrier.
+//
+// To guarantee progress and safety, we need to make sure that a new barrier tag
+// starts with a completely empty set of waiters and a free semaphore. This
+// requires either waiting for all threads to leave wait() for the current barrier
+// tag on disarm(), or waiting for all threads to leave the previous tag before
+// reusing the semaphore in arm().
+//
+// When there are multiple threads, it is normal for some threads to take
+// significant time to leave the barrier. Waiting for these threads introduces
+// stalls on barrier reuse.
+//
+// If we wait on disarm(), this stall is nearly guaranteed to happen if some threads
+// are de-scheduled by a prior wait(). It would be especially bad if there are more
+// waiting threads than CPUs: every thread would need to wake up and register itself
+// as leaving before we can unblock from disarm().
+//
+// If we wait on arm(), we can get lucky: most threads manage to catch up and exit
+// wait(), so we arrive at arm() with the semaphore ready for reuse. However, that
+// is still insufficient in practice.
+//
+// Therefore, this implementation goes a step further and implements _striped_
+// semaphores. We maintain several semaphores in cells. Barrier tags are assigned
+// to cells in some simple manner. Most of the current uses have sequential barrier
+// tags, so a simple modulo works well. We then operate on a cell like we would
+// operate on a single semaphore: we wait at arm() for all threads to catch up
+// before reusing the cell. For the cost of maintaining just a few cells, we have
+// a large enough window for threads to catch up.
+//
+// Correctness is guaranteed by using a single atomic state variable per cell,
+// with updates always done with CASes:
+//
+//   [.......... barrier tag ..........][.......... waiters ..........]
+//  63                                  31                            0
+//
+// A cell starts with a zero tag and zero waiters. Arming the cell swings the
+// barrier tag from zero to some tag, while checking that no waiters have appeared.
+// Disarming swings the barrier tag back from that tag to zero. Every waiter
+// registers itself by incrementing the "waiters", while checking that the barrier
+// tag is still the same. Every completing waiter decrements the "waiters". When
+// all waiters complete, the cell ends up in its initial state, ready to be armed
+// again.
+// This allows accurate tracking of how many signals to issue, and it does not
+// race with disarm.
+//
+// The implementation uses the strongest (default) barriers for extra safety, even
+// when not strictly required to do so for correctness. Extra barrier overhead is
+// dominated by the actual wait/notify latency anyway.
+//
+
 void GenericWaitBarrier::arm(int barrier_tag) {
-  assert(_barrier_tag == 0, "Already armed");
-  assert(_waiters == 0, "We left a thread hanging");
-  _barrier_tag = barrier_tag;
-  _waiters = 0;
+  assert(barrier_tag != 0, "Pre arm: Should be arming with armed value");
+  assert(Atomic::load(&_barrier_tag) == 0,
+         "Pre arm: Should not be already armed. Tag: %d",
+         Atomic::load(&_barrier_tag));
+  Atomic::release_store(&_barrier_tag, barrier_tag);
+
+  Cell &cell = tag_to_cell(barrier_tag);
+  cell.arm(barrier_tag);
+
+  // API specifies arm() must provide a trailing fence.
   OrderAccess::fence();
 }
 
-int GenericWaitBarrier::wake_if_needed() {
-  assert(_barrier_tag == 0, "Not disarmed");
-  int w = _waiters;
-  if (w == 0) {
-    // Load of _barrier_threads in caller must not pass the load of _waiters.
-    OrderAccess::loadload();
-    return 0;
-  }
-  assert(w > 0, "Bad counting");
-  // We need an exact count which never goes below zero,
-  // otherwise the semaphore may be signalled too many times.
-  if (Atomic::cmpxchg(&_waiters, w, w - 1) == w) {
-    _sem_barrier.signal();
-    return w - 1;
-  }
-  return w;
+void GenericWaitBarrier::disarm() {
+  int barrier_tag = Atomic::load_acquire(&_barrier_tag);
+  assert(barrier_tag != 0, "Pre disarm: Should be armed. Tag: %d", barrier_tag);
+  Atomic::release_store(&_barrier_tag, 0);
+
+  Cell &cell = tag_to_cell(barrier_tag);
+  cell.disarm(barrier_tag);
+
+  // API specifies disarm() must provide a trailing fence.
+  OrderAccess::fence();
 }
 
-void GenericWaitBarrier::disarm() {
-  assert(_barrier_tag != 0, "Not armed");
-  _barrier_tag = 0;
-  // Loads of _barrier_threads/_waiters must not float above disarm store and
-  // disarm store must not sink below.
+void GenericWaitBarrier::wait(int barrier_tag) {
+  assert(barrier_tag != 0, "Pre wait: Should be waiting on armed value");
+
+  Cell &cell = tag_to_cell(barrier_tag);
+  cell.wait(barrier_tag);
+
+  // API specifies wait() must provide a trailing fence.
   OrderAccess::fence();
-  int left;
+}
+
+void GenericWaitBarrier::Cell::arm(int32_t requested_tag) {
+  // Before we continue to arm, we need to make sure that all threads
+  // have left the previous cell.
+
+  int64_t state;
+
   SpinYield sp;
-  do {
-    left = GenericWaitBarrier::wake_if_needed();
-    if (left == 0 && _barrier_threads > 0) {
-      // There is no thread to wake but we still have barrier threads.
+  while (true) {
+    state = Atomic::load_acquire(&_state);
+    assert(decode_tag(state) == 0,
+           "Pre arm: Should not be armed. "
+           "Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
+           decode_tag(state), decode_waiters(state));
+    if (decode_waiters(state) == 0) {
+      break;
+    }
+    sp.wait();
+  }
+
+  // Try to swing the cell to armed. This should always succeed after the check above.
+  int64_t new_state = encode(requested_tag, 0);
+  int64_t prev_state = Atomic::cmpxchg(&_state, state, new_state);
+  if (prev_state != state) {
+    fatal("Cannot arm the wait barrier. "
+          "Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
+          decode_tag(prev_state), decode_waiters(prev_state));
+  }
+}
+
+int GenericWaitBarrier::Cell::signal_if_needed(int max) {
+  int signals = 0;
+  while (true) {
+    int cur = Atomic::load_acquire(&_outstanding_wakeups);
+    if (cur == 0) {
+      // All done, no more waiters.
+      return 0;
+    }
+    assert(cur > 0, "Sanity");
+
+    int prev = Atomic::cmpxchg(&_outstanding_wakeups, cur, cur - 1);
+    if (prev != cur) {
+      // Contention, return to caller for early return or backoff.
+      return prev;
+    }
+
+    // Signal!
+    _sem.signal();
+
+    if (++signals >= max) {
+      // Signalled the requested number of times, break out.
+      return prev;
+    }
+  }
+}
+
+void GenericWaitBarrier::Cell::disarm(int32_t expected_tag) {
+  int32_t waiters;
+
+  while (true) {
+    int64_t state = Atomic::load_acquire(&_state);
+    int32_t tag = decode_tag(state);
+    waiters = decode_waiters(state);
+
+    assert((tag == expected_tag) && (waiters >= 0),
+           "Mid disarm: Should be armed with expected tag and have sane waiters. "
+           "Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
+           tag, waiters);
+
+    int64_t new_state = encode(0, waiters);
+    if (Atomic::cmpxchg(&_state, state, new_state) == state) {
+      // Successfully disarmed.
+      break;
+    }
+  }
+
+  // Wake up waiters, if we have at least one.
+  // Allow other threads to assist with the wakeups, if possible.
+  if (waiters > 0) {
+    Atomic::release_store(&_outstanding_wakeups, waiters);
+    SpinYield sp;
+    while (signal_if_needed(INT_MAX) > 0) {
       sp.wait();
     }
-    // We must loop here until there are no waiters or potential waiters.
-  } while (left > 0 || _barrier_threads > 0);
-  // API specifies disarm() must provide a trailing fence.
-  OrderAccess::fence();
+  }
+  assert(Atomic::load(&_outstanding_wakeups) == 0, "Post disarm: Should not have outstanding wakeups");
 }
 
-void GenericWaitBarrier::wait(int barrier_tag) {
-  assert(barrier_tag != 0, "Trying to wait on disarmed value");
-  if (barrier_tag != _barrier_tag) {
-    // API specifies wait() must provide a trailing fence.
-    OrderAccess::fence();
-    return;
+void GenericWaitBarrier::Cell::wait(int32_t expected_tag) {
+  // Try to register ourselves as a pending waiter.
+  while (true) {
+    int64_t state = Atomic::load_acquire(&_state);
+    int32_t tag = decode_tag(state);
+    if (tag != expected_tag) {
+      // The cell tag changed while we were getting here. This means either the
+      // cell has been disarmed, or we are late and the cell was armed with a
+      // new tag. Exit without touching anything else.
+      return;
+    }
+    int32_t waiters = decode_waiters(state);
+
+    assert((tag == expected_tag) && (waiters >= 0 && waiters < INT32_MAX),
+           "Before wait: Should be armed with expected tag and waiters are in range. "
+           "Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
+           tag, waiters);
+
+    int64_t new_state = encode(tag, waiters + 1);
+    if (Atomic::cmpxchg(&_state, state, new_state) == state) {
+      // Success! Proceed to wait.
+      break;
+    }
   }
-  Atomic::add(&_barrier_threads, 1);
-  if (barrier_tag != 0 && barrier_tag == _barrier_tag) {
-    Atomic::add(&_waiters, 1);
-    _sem_barrier.wait();
-    // We help out with posting, but we need to do so before we decrement the
-    // _barrier_threads otherwise we might wake threads up in next wait.
-    GenericWaitBarrier::wake_if_needed();
+
+  // Wait for notification.
+  _sem.wait();
+
+  // Unblocked! We help out by waking up two siblings. This lets the wakeups
+  // avalanche through many threads, even if some threads are lagging behind.
+  // Note that we can only do this *before* reporting back as a completed waiter,
+  // otherwise we might prematurely wake up threads for another barrier tag.
+  // The current arm() sequence protects us from this trouble by waiting until
+  // all waiters leave.
+  signal_if_needed(2);
+
+  // Register ourselves as a completed waiter before leaving.
+  while (true) {
+    int64_t state = Atomic::load_acquire(&_state);
+    int32_t tag = decode_tag(state);
+    int32_t waiters = decode_waiters(state);
+
+    assert((tag == 0) && (waiters > 0),
+           "After wait: Should be not armed and have non-complete waiters. "
+           "Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
+           tag, waiters);
+
+    int64_t new_state = encode(tag, waiters - 1);
+    if (Atomic::cmpxchg(&_state, state, new_state) == state) {
+      // Success!
+      break;
+    }
   }
-  Atomic::add(&_barrier_threads, -1);
 }
diff --git a/src/hotspot/share/utilities/waitBarrier_generic.hpp b/src/hotspot/share/utilities/waitBarrier_generic.hpp
index 50bfea6aebf..d3a45b33b82 100644
--- a/src/hotspot/share/utilities/waitBarrier_generic.hpp
+++ b/src/hotspot/share/utilities/waitBarrier_generic.hpp
@@ -26,29 +26,79 @@
 #define SHARE_UTILITIES_WAITBARRIER_GENERIC_HPP
 
 #include "memory/allocation.hpp"
+#include "memory/padded.hpp"
 #include "runtime/semaphore.hpp"
 #include "utilities/globalDefinitions.hpp"
 
-// In addition to the barrier tag, it uses two counters to keep the semaphore
-// count correct and not leave any late thread waiting.
 class GenericWaitBarrier : public CHeapObj<mtInternal> {
+private:
+  class Cell : public CHeapObj<mtInternal> {
+  private:
+    // Pad out the cells to avoid interference between the cells.
+    // This insulates us from stalls when adjacent cells have returning
+    // workers and contend over the cache line for the current
+    // latency-critical cell.
+    DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
+
+    Semaphore _sem;
+
+    // Cell state, tracks the arming + waiters status
+    volatile int64_t _state;
+
+    // Wakeups to deliver for current waiters
+    volatile int _outstanding_wakeups;
+
+    int signal_if_needed(int max);
+
+    static int64_t encode(int32_t barrier_tag, int32_t waiters) {
+      int64_t val = (((int64_t) barrier_tag) << 32) |
+                    (((int64_t) waiters) & 0xFFFFFFFF);
+      assert(decode_tag(val) == barrier_tag, "Encoding is reversible");
+      assert(decode_waiters(val) == waiters, "Encoding is reversible");
+      return val;
+    }
+
+    static int32_t decode_tag(int64_t value) {
+      return (int32_t)(value >> 32);
+    }
+
+    static int32_t decode_waiters(int64_t value) {
+      return (int32_t)(value & 0xFFFFFFFF);
+    }
+
+  public:
+    Cell() : _sem(0), _state(encode(0, 0)), _outstanding_wakeups(0) {}
+    NONCOPYABLE(Cell);
+
+    void arm(int32_t requested_tag);
+    void disarm(int32_t expected_tag);
+    void wait(int32_t expected_tag);
+  };
+
+  // Should be enough for most uses without exploding the footprint.
+  static constexpr int CELLS_COUNT = 16;
+
+  Cell _cells[CELLS_COUNT];
+
+  // Trailing padding to protect the last cell.
+  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
+
   volatile int _barrier_tag;
-  // The number of threads waiting on or about to wait on the semaphore.
-  volatile int _waiters;
-  // The number of threads in the wait path, before or after the tag check.
-  // These threads can become waiters.
-  volatile int _barrier_threads;
-  Semaphore _sem_barrier;
+
+  // Trailing padding to insulate the rest of the barrier from adjacent
+  // data structures. The leading padding is not needed, as the cell
+  // padding handles this for us.
+  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);
 
   NONCOPYABLE(GenericWaitBarrier);
 
-  int wake_if_needed();
+  Cell& tag_to_cell(int tag) { return _cells[tag & (CELLS_COUNT - 1)]; }
 
- public:
-  GenericWaitBarrier() : _barrier_tag(0), _waiters(0), _barrier_threads(0), _sem_barrier(0) {}
+public:
+  GenericWaitBarrier() : _cells(), _barrier_tag(0) {}
   ~GenericWaitBarrier() {}
 
-  const char* description() { return "semaphore"; }
+  const char* description() { return "striped semaphore"; }
 
   void arm(int barrier_tag);
   void disarm();
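
--
Notes (editorial addendum; illustrative sketches, not part of the patch):

The cell state above packs the barrier tag into the high 32 bits of one
64-bit word and the waiter count into the low 32 bits, so both can be
updated by a single CAS. A minimal standalone round-trip of that packing
(plain C++; it mirrors Cell::encode/decode_tag/decode_waiters, but nothing
below is HotSpot code):

#include <cassert>
#include <cstdint>

static int64_t encode(int32_t tag, int32_t waiters) {
  // High half: tag. Low half: waiter count, masked to keep the low 32 bits.
  return (((int64_t) tag) << 32) | (((int64_t) waiters) & 0xFFFFFFFF);
}

static int32_t decode_tag(int64_t state)     { return (int32_t)(state >> 32); }
static int32_t decode_waiters(int64_t state) { return (int32_t)(state & 0xFFFFFFFF); }

int main() {
  // Armed with tag 42 and three registered waiters.
  int64_t s = encode(42, 3);
  assert(decode_tag(s) == 42 && decode_waiters(s) == 3);

  // Disarming zeroes the tag but keeps the waiter count, which is exactly
  // the state swing Cell::disarm() performs.
  int64_t d = encode(0, decode_waiters(s));
  assert(decode_tag(d) == 0 && decode_waiters(d) == 3);
  return 0;
}

Packing both fields into one word is what makes the protocol safe: a waiter
cannot register against a tag that is concurrently being disarmed, because
the tag check and the waiter increment commit in the same CAS.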
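
The striping itself hinges on tag_to_cell(): since CELLS_COUNT is a power
of two, masking with (CELLS_COUNT - 1) is equivalent to modulo, and the
mostly-sequential tags walk the cells round-robin. A tiny sketch of that
mapping (standalone C++; tag_to_cell_index is a hypothetical name):

#include <cstdio>

static constexpr int CELLS_COUNT = 16; // must stay a power of two for the mask trick

static int tag_to_cell_index(int tag) {
  return tag & (CELLS_COUNT - 1);      // same as tag % CELLS_COUNT for non-negative tags
}

int main() {
  // Tag 17 is the first to reuse cell 1, so a straggler from tag 1 has
  // 15 intervening barrier cycles to drain out of that cell before arm()
  // has to spin waiting for it.
  for (int tag = 1; tag <= 18; tag++) {
    printf("tag %2d -> cell %2d\n", tag, tag_to_cell_index(tag));
  }
  return 0;
}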
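
The per-cell life cycle (arm waits out stragglers, waiters CAS themselves
in, disarm swings the tag and signals exactly the registered count) can be
modeled outside HotSpot with C++20 primitives. This is a simplified,
single-armer sketch that drops the striping and the two-sibling wakeup
assist; CellModel and its members are invented names:

#include <atomic>
#include <cassert>
#include <cstdint>
#include <semaphore>
#include <thread>

class CellModel {
  std::atomic<int64_t> _state{0};      // [tag:32 | waiters:32], as in the patch
  std::counting_semaphore<> _sem{0};

  static int64_t encode(int32_t t, int32_t w) {
    return (((int64_t) t) << 32) | (((int64_t) w) & 0xFFFFFFFF);
  }
  static int32_t tag(int64_t s)     { return (int32_t)(s >> 32); }
  static int32_t waiters(int64_t s) { return (int32_t)(s & 0xFFFFFFFF); }

public:
  void arm(int32_t t) {
    // Spin until every waiter from the previous use has deregistered,
    // then swing 0 -> t. A single armer is assumed, so the CAS succeeds.
    int64_t s = _state.load();
    while (waiters(s) != 0) {
      std::this_thread::yield();
      s = _state.load();
    }
    _state.compare_exchange_strong(s, encode(t, 0));
  }

  void wait(int32_t t) {
    // Register as a waiter, but only while the tag still matches.
    int64_t s = _state.load();
    do {
      if (tag(s) != t) return;         // disarmed or re-armed: leave at once
    } while (!_state.compare_exchange_weak(s, encode(t, waiters(s) + 1)));

    _sem.acquire();                    // block until disarm() signals

    // Deregister; once the last waiter leaves, arm() can reuse the cell.
    s = _state.load();
    while (!_state.compare_exchange_weak(s, encode(tag(s), waiters(s) - 1))) {}
  }

  void disarm(int32_t t) {
    // Swing the tag t -> 0 while keeping the waiter count, then signal
    // exactly that many times: no waiter is missed or double-woken.
    int64_t s = _state.load();
    assert(tag(s) == t && "must be armed with this tag");
    while (!_state.compare_exchange_weak(s, encode(0, waiters(s)))) {}
    if (waiters(s) > 0) {
      _sem.release(waiters(s));
    }
  }
};

The exact-count signalling is the point of the scheme: because disarm()
learns the waiter count from the same CAS that clears the tag, the
semaphore can never carry stray permits into the next tag that reuses
this cell.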
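
Waiter-assisted wakeup, the other half of the disarm path, also admits a
compact model: disarm() publishes the number of owed signals, and every
participant (the disarmer, or a freshly woken waiter calling with max = 2)
claims units one CAS at a time before signalling. A sketch with invented
names, using a callback in place of the semaphore; the return-value
convention only approximates Cell::signal_if_needed():

#include <atomic>

static std::atomic<int> outstanding_wakeups{0};

// Returns 0 once all owed signals are delivered; a non-zero value means
// work was left (or a CAS was lost), so the caller may retry or back off.
template <typename SignalFn>
int drain_wakeups(int max, SignalFn signal) {
  int signals = 0;
  while (true) {
    int cur = outstanding_wakeups.load();
    if (cur == 0) {
      return 0;                        // nothing left to deliver
    }
    if (!outstanding_wakeups.compare_exchange_strong(cur, cur - 1)) {
      return cur;                      // lost the race; report back
    }
    signal();                          // deliver one wakeup
    if (++signals >= max) {
      return cur - 1;                  // our budget is spent
    }
  }
}

Each woken thread paying two wakeups forward turns the disarmer's linear
signalling loop into a branching wakeup tree, which is why wait() calls
signal_if_needed(2) before deregistering.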