Skip to content

Commit

Permalink
Merge pull request #2270 from igchor/deferred_event_deallocation
Browse files Browse the repository at this point in the history
[L0 v2] Use single command list for all operations and implement deferred event deallocation
  • Loading branch information
pbalcer authored Nov 8, 2024
2 parents 2aaa261 + f48ae3d commit 9ae8e65
Show file tree
Hide file tree
Showing 15 changed files with 336 additions and 276 deletions.
3 changes: 3 additions & 0 deletions scripts/templates/queue_api.hpp.mako
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ from templates import helper as th

struct ur_queue_handle_t_ {
virtual ~ur_queue_handle_t_();

virtual void deferEventFree(ur_event_handle_t hEvent) = 0;

%for obj in th.get_queue_related_functions(specs, n, tags):
virtual ${x}_result_t ${th.transform_queue_related_function_name(n, tags, obj, format=["type"])} = 0;
%endfor
Expand Down
30 changes: 27 additions & 3 deletions source/adapters/level_zero/v2/command_list_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,21 @@

#include "../device.hpp"

// Descriptor carrying the copy-operations offload hint. An instance of it is
// linked through pNext into the queue / command-list descriptors used when
// command lists are created in this file.
// NOTE(review): the layout presumably mirrors the Intel experimental Level
// Zero extension definition -- confirm against the driver headers.
struct _zex_intel_queue_copy_operations_offload_hint_exp_desc_t {
  // Structure type tag identifying this descriptor.
  ze_structure_type_t stype;
  // Next descriptor in the extension chain.
  const void *pNext;
  // Whether copy offload is requested.
  ze_bool_t copyOffloadEnabled;
};
using zex_intel_queue_copy_operations_offload_hint_exp_desc_t =
    _zex_intel_queue_copy_operations_offload_hint_exp_desc_t;

// Structure-type tag for the copy-operations offload hint descriptor; this is
// the value returned by the getZeStructureType specialization for
// zex_intel_queue_copy_operations_offload_hint_exp_desc_t.
// NOTE(review): 0x0003001B presumably matches the value defined by the Intel
// Level Zero extension headers -- confirm before changing.
#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES \
(ze_structure_type_t)0x0003001B

// Specialization that reports the structure-type tag for the copy-operations
// offload hint descriptor.
// NOTE(review): presumably consumed by ZeStruct<> to fill in `stype` -- confirm.
template <>
ze_structure_type_t
getZeStructureType<zex_intel_queue_copy_operations_offload_hint_exp_desc_t>() {
  const ze_structure_type_t offloadHintType =
      ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES;
  return offloadHintType;
}

bool v2::immediate_command_list_descriptor_t::operator==(
const immediate_command_list_descriptor_t &rhs) const {
return ZeDevice == rhs.ZeDevice && IsInOrder == rhs.IsInOrder &&
Expand Down Expand Up @@ -45,6 +60,10 @@ command_list_cache_t::command_list_cache_t(ze_context_handle_t ZeContext)

raii::ze_command_list_handle_t
command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) {
ZeStruct<zex_intel_queue_copy_operations_offload_hint_exp_desc_t> offloadDesc;
offloadDesc.copyOffloadEnabled =
std::visit([](auto &&arg) { return arg.CopyOffloadEnabled; }, desc);

if (auto ImmCmdDesc =
std::get_if<immediate_command_list_descriptor_t>(&desc)) {
ze_command_list_handle_t ZeCommandList;
Expand All @@ -58,6 +77,7 @@ command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) {
QueueDesc.flags |= ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
QueueDesc.index = ImmCmdDesc->Index.value();
}
QueueDesc.pNext = &offloadDesc;
ZE2UR_CALL_THROWS(
zeCommandListCreateImmediate,
(ZeContext, ImmCmdDesc->ZeDevice, &QueueDesc, &ZeCommandList));
Expand All @@ -68,6 +88,7 @@ command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) {
CmdListDesc.flags =
RegCmdDesc.IsInOrder ? ZE_COMMAND_LIST_FLAG_IN_ORDER : 0;
CmdListDesc.commandQueueGroupOrdinal = RegCmdDesc.Ordinal;
CmdListDesc.pNext = &offloadDesc;

ze_command_list_handle_t ZeCommandList;
ZE2UR_CALL_THROWS(zeCommandListCreate, (ZeContext, RegCmdDesc.ZeDevice,
Expand All @@ -78,13 +99,14 @@ command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) {

raii::command_list_unique_handle command_list_cache_t::getImmediateCommandList(
ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal,
ze_command_queue_mode_t Mode, ze_command_queue_priority_t Priority,
std::optional<uint32_t> Index) {
bool CopyOffloadEnable, ze_command_queue_mode_t Mode,
ze_command_queue_priority_t Priority, std::optional<uint32_t> Index) {
TRACK_SCOPE_LATENCY("command_list_cache_t::getImmediateCommandList");

immediate_command_list_descriptor_t Desc;
Desc.ZeDevice = ZeDevice;
Desc.Ordinal = Ordinal;
Desc.CopyOffloadEnabled = CopyOffloadEnable;
Desc.IsInOrder = IsInOrder;
Desc.Mode = Mode;
Desc.Priority = Priority;
Expand All @@ -99,13 +121,15 @@ raii::command_list_unique_handle command_list_cache_t::getImmediateCommandList(

raii::command_list_unique_handle
command_list_cache_t::getRegularCommandList(ze_device_handle_t ZeDevice,
bool IsInOrder, uint32_t Ordinal) {
bool IsInOrder, uint32_t Ordinal,
bool CopyOffloadEnable) {
TRACK_SCOPE_LATENCY("command_list_cache_t::getRegularCommandList");

regular_command_list_descriptor_t Desc;
Desc.ZeDevice = ZeDevice;
Desc.IsInOrder = IsInOrder;
Desc.Ordinal = Ordinal;
Desc.CopyOffloadEnabled = CopyOffloadEnable;

auto [CommandList, _] = getCommandList(Desc).release();

Expand Down
7 changes: 5 additions & 2 deletions source/adapters/level_zero/v2/command_list_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ struct immediate_command_list_descriptor_t {
ze_device_handle_t ZeDevice;
bool IsInOrder;
uint32_t Ordinal;
bool CopyOffloadEnabled;
ze_command_queue_mode_t Mode;
ze_command_queue_priority_t Priority;
std::optional<uint32_t> Index;
Expand All @@ -40,6 +41,7 @@ struct regular_command_list_descriptor_t {
ze_device_handle_t ZeDevice;
bool IsInOrder;
uint32_t Ordinal;
bool CopyOffloadEnabled;
bool operator==(const regular_command_list_descriptor_t &rhs) const;
};

Expand All @@ -56,12 +58,13 @@ struct command_list_cache_t {

raii::command_list_unique_handle
getImmediateCommandList(ze_device_handle_t ZeDevice, bool IsInOrder,
uint32_t Ordinal, ze_command_queue_mode_t Mode,
uint32_t Ordinal, bool CopyOffloadEnable,
ze_command_queue_mode_t Mode,
ze_command_queue_priority_t Priority,
std::optional<uint32_t> Index = std::nullopt);
raii::command_list_unique_handle
getRegularCommandList(ze_device_handle_t ZeDevice, bool IsInOrder,
uint32_t Ordinal);
uint32_t Ordinal, bool CopyOffloadEnable);

// For testing purposes
size_t getNumImmediateCommandLists();
Expand Down
60 changes: 50 additions & 10 deletions source/adapters/level_zero/v2/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "event.hpp"
#include "event_pool.hpp"
#include "event_provider.hpp"
#include "queue_api.hpp"

#include "../ur_interface_loader.hpp"

Expand All @@ -24,6 +25,12 @@ ur_event_handle_t_::ur_event_handle_t_(
zeTimerResolution(getDevice()->ZeDeviceProperties->timerResolution),
timestampMaxValue(getDevice()->getTimestampMask()) {}

// Associates this event with a queue and with the type of command it tracks,
// overwriting whatever association was stored previously.
void ur_event_handle_t_::resetQueueAndCommand(ur_queue_handle_t hQueue,
                                              ur_command_t commandType) {
  // The parameters shadow the members of the same name, hence the explicit
  // `this->` qualification on the left-hand sides.
  this->commandType = commandType;
  this->hQueue = hQueue;
}

void ur_event_handle_t_::reset() {
// Consider introducing an abstraction for regular vs. counter-based
// events if more conditions of this kind are needed.
Expand All @@ -33,6 +40,8 @@ void ur_event_handle_t_::reset() {
}

// Returns the underlying Level Zero event handle.
// When assertions are enabled, first checks that the event has been bound to
// a queue and that its command type is no longer the UR_COMMAND_FORCE_UINT32
// placeholder.
ze_event_handle_t ur_event_handle_t_::getZeEvent() const {
  assert(hQueue);
  assert(commandType != UR_COMMAND_FORCE_UINT32);

  ze_event_handle_t rawHandle = zeEvent.get();
  return rawHandle;
}

Expand All @@ -41,14 +50,27 @@ ur_result_t ur_event_handle_t_::retain() {
return UR_RESULT_SUCCESS;
}

// Hands this event back to its pool as the final step of a deferred release.
// Preconditions (checked only when assertions are enabled):
//   - zeEventQueryStatus reports ZE_RESULT_SUCCESS for the underlying event;
//   - the reference count has already dropped to zero.
// Always returns UR_RESULT_SUCCESS.
ur_result_t ur_event_handle_t_::releaseDeferred() {
assert(zeEventQueryStatus(zeEvent.get()) == ZE_RESULT_SUCCESS);
assert(RefCount.load() == 0);

// Give the event object back to the pool it belongs to.
// NOTE(review): presumably `this` must not be used by the caller afterwards,
// since the pool may hand the event out again -- confirm.
pool->free(this);
return UR_RESULT_SUCCESS;
}

ur_result_t ur_event_handle_t_::release() {
if (!RefCount.decrementAndTest())
return UR_RESULT_SUCCESS;

// Need to take a lock before checking if the event is timestamped.
std::unique_lock<ur_shared_mutex> lock(Mutex);

if (isTimestamped() && adjustedEventEndTimestamp == 0) {
// L0 will write end timestamp to this event some time in the future,
// so we can't release it yet.
// TODO: delay releasing until the end timestamp is written.

assert(hQueue);
hQueue->deferEventFree(this);
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -99,17 +121,16 @@ uint64_t ur_event_handle_t_::getEventEndTimestamp() {
if (adjustedEventEndTimestamp)
return adjustedEventEndTimestamp;

// If the result is 0, we have not yet gotten results back and so we just
// return it.
if (recordEventEndTimestamp == 0)
return recordEventEndTimestamp;
auto status = zeEventQueryStatus(zeEvent.get());
if (status != ZE_RESULT_SUCCESS) {
// profiling info not ready
return 0;
}

// Now that we have the result, there is no need to keep it in the queue
// anymore, so we cache it on the event and evict the record from the
// queue.
adjustedEventEndTimestamp =
adjustEndEventTimestamp(getEventStartTimestmap(), recordEventEndTimestamp,
timestampMaxValue, zeTimerResolution);

return adjustedEventEndTimestamp;
}

Expand All @@ -118,13 +139,19 @@ void ur_event_handle_t_::recordStartTimestamp() {
UR_CALL_THROWS(ur::level_zero::urDeviceGetGlobalTimestamps(
getDevice(), &deviceStartTimestamp, nullptr));

assert(adjustedEventStartTimestamp == 0);
adjustedEventStartTimestamp = deviceStartTimestamp;
}

uint64_t *ur_event_handle_t_::getEventEndTimestampPtr() {
return &recordEventEndTimestamp;
std::pair<uint64_t *, ze_event_handle_t>
ur_event_handle_t_::getEventEndTimestampAndHandle() {
return {&recordEventEndTimestamp, zeEvent.get()};
}

// Returns the queue handle currently stored on this event (the one passed to
// the most recent resetQueueAndCommand call, or nullptr if none was set).
ur_queue_handle_t ur_event_handle_t_::getQueue() const {
  return this->hQueue;
}

// Returns the command type currently stored on this event.
ur_command_t ur_event_handle_t_::getCommandType() const {
  return this->commandType;
}

namespace ur::level_zero {
// Forwards to the event's own retain() and returns its result.
ur_result_t urEventRetain(ur_event_handle_t hEvent) {
  ur_result_t result = hEvent->retain();
  return result;
}

Expand Down Expand Up @@ -159,6 +186,19 @@ ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName,
case UR_EVENT_INFO_REFERENCE_COUNT: {
return returnValue(hEvent->RefCount.load());
}
case UR_EVENT_INFO_COMMAND_QUEUE: {
return returnValue(ur_queue_handle_t{hEvent->getQueue()});
}
case UR_EVENT_INFO_CONTEXT: {
ur_context_handle_t hContext;
UR_CALL(::ur::level_zero::urQueueGetInfo(
hEvent->getQueue(), UR_QUEUE_INFO_CONTEXT, sizeof(hContext),
reinterpret_cast<void *>(&hContext), nullptr));
return returnValue(hContext);
}
case UR_EVENT_INFO_COMMAND_TYPE: {
return returnValue(hEvent->getCommandType());
}
default:
logger::error(
"Unsupported ParamName in urEventGetInfo: ParamName=ParamName={}(0x{})",
Expand Down
20 changes: 19 additions & 1 deletion source/adapters/level_zero/v2/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,19 @@ struct ur_event_handle_t_ : _ur_object {
ur_event_handle_t_(v2::raii::cache_borrowed_event eventAllocation,
v2::event_pool *pool);

// Set the queue and command that this event is associated with
void resetQueueAndCommand(ur_queue_handle_t hQueue, ur_command_t commandType);

void reset();
ze_event_handle_t getZeEvent() const;

ur_result_t retain();
ur_result_t release();

// Releases a signaled, no-longer-in-use event that is on the
// deferred events list in the queue.
ur_result_t releaseDeferred();

// Tells if this event was created as a timestamp event, allowing profiling
// info even if profiling is not enabled.
bool isTimestamped() const;
Expand All @@ -43,13 +50,24 @@ struct ur_event_handle_t_ : _ur_object {
// Device associated with this event
ur_device_handle_t getDevice() const;

// Queue associated with this event
ur_queue_handle_t getQueue() const;

// Get the type of the command that this event is associated with
ur_command_t getCommandType() const;

void recordStartTimestamp();
uint64_t *getEventEndTimestampPtr();

// Get pointer to the end timestamp, and ze event handle.
// Caller is responsible for signaling the event once the timestamp is ready.
std::pair<uint64_t *, ze_event_handle_t> getEventEndTimestampAndHandle();

uint64_t getEventStartTimestmap() const;
uint64_t getEventEndTimestamp();

private:
ur_queue_handle_t hQueue = nullptr;
ur_command_t commandType = UR_COMMAND_FORCE_UINT32;
v2::raii::cache_borrowed_event zeEvent;
v2::event_pool *pool;

Expand Down
5 changes: 4 additions & 1 deletion source/adapters/level_zero/v2/event_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ namespace v2 {

static constexpr size_t EVENTS_BURST = 64;

ur_event_handle_t_ *event_pool::allocate() {
ur_event_handle_t_ *event_pool::allocate(ur_queue_handle_t hQueue,
ur_command_t commandType) {
TRACK_SCOPE_LATENCY("event_pool::allocate");

std::unique_lock<std::mutex> lock(*mutex);
Expand All @@ -32,6 +33,8 @@ ur_event_handle_t_ *event_pool::allocate() {
auto event = freelist.back();
freelist.pop_back();

event->resetQueueAndCommand(hQueue, commandType);

return event;
}

Expand Down
3 changes: 2 additions & 1 deletion source/adapters/level_zero/v2/event_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ class event_pool {
DeviceId Id() { return provider->device()->Id.value(); };

// Allocate an event from the pool. Thread safe.
ur_event_handle_t_ *allocate();
ur_event_handle_t_ *allocate(ur_queue_handle_t hQueue,
ur_command_t commandType);

// Free an event back to the pool. Thread safe.
void free(ur_event_handle_t_ *event);
Expand Down
2 changes: 1 addition & 1 deletion source/adapters/level_zero/v2/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
hDevice
->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
.ZeOrdinal,
ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
true, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
std::nullopt);

ZE2UR_CALL(zeCommandListAppendMemoryCopy,
Expand Down
3 changes: 3 additions & 0 deletions source/adapters/level_zero/v2/queue_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@

struct ur_queue_handle_t_ {
virtual ~ur_queue_handle_t_();

virtual void deferEventFree(ur_event_handle_t hEvent) = 0;

virtual ur_result_t queueGetInfo(ur_queue_info_t, size_t, void *,
size_t *) = 0;
virtual ur_result_t queueRetain() = 0;
Expand Down
Loading

0 comments on commit 9ae8e65

Please sign in to comment.