diff --git a/shortfin/python/array_binding.cc b/shortfin/python/array_binding.cc index da7197b14..a05232674 100644 --- a/shortfin/python/array_binding.cc +++ b/shortfin/python/array_binding.cc @@ -7,6 +7,7 @@ #include "./lib_ext.h" #include "./utils.h" #include "shortfin/array/api.h" +#include "shortfin/support/logging.h" using namespace shortfin::array; @@ -223,6 +224,7 @@ class PyMapping { } void FillFromScalar(Refs *refs, py::handle value) { + SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::FillFromScalar"); if (!dtype()) { throw std::invalid_argument( "The `fill` method is only valid for typed mappings but " @@ -242,6 +244,7 @@ class PyMapping { } void FillFromBuffer(py::handle buffer) { + SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::FillFromBuffer"); Py_buffer py_view; int flags = PyBUF_FORMAT | PyBUF_ND; // C-Contiguous ND. if (PyObject_GetBuffer(buffer.ptr(), &py_view, flags) != 0) { @@ -286,6 +289,7 @@ class PyMapping { } py::object GetItems(py::handle self_obj, Refs *refs) { + SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::GetItems"); if (!dtype()) { throw std::invalid_argument( "The `items` property is only valid for typed mappings but " @@ -306,6 +310,7 @@ class PyMapping { } void SetItems(Refs *refs, py::handle initializer) { + SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::SetItems"); if (!dtype()) { throw std::invalid_argument( "The `items` property is only valid for typed mappings but " @@ -410,6 +415,7 @@ void BindArray(py::module_ &m) { .def( "map", [](storage &self, bool read, bool write, bool discard) { + SHORTFIN_TRACE_SCOPE_NAMED("PyStorage::map"); int access = 0; if (read) access |= IREE_HAL_MEMORY_ACCESS_READ; if (write || discard) access |= IREE_HAL_MEMORY_ACCESS_WRITE; @@ -565,6 +571,7 @@ void BindArray(py::module_ &m) { .def( "map", [](device_array &self, bool read, bool write, bool discard) { + SHORTFIN_TRACE_SCOPE_NAMED("PyArray::map"); int access = 0; if (read) access |= IREE_HAL_MEMORY_ACCESS_READ; if (write || discard) access |= IREE_HAL_MEMORY_ACCESS_WRITE; @@ -586,6 +593,7 @@ void BindArray(py::module_ &m) { .def_prop_rw( "items", [refs](device_array &self) { + SHORTFIN_TRACE_SCOPE_NAMED("PyArray::items"); PyMapping *mapping; py::object mapping_obj = CreateMappingObject(&mapping); mapping->set_dtype(self.dtype()); @@ -606,6 +614,7 @@ void BindArray(py::module_ &m) { .def_prop_ro( "__array_interface__", [refs](device_array &self) { + SHORTFIN_TRACE_SCOPE_NAMED("PyArray::__array_interface__"); py::dict interface; interface["version"] = 3; interface["strides"] = py::none(); diff --git a/shortfin/python/array_host_ops.cc b/shortfin/python/array_host_ops.cc index 8c4af0070..8eaf4ddd8 100644 --- a/shortfin/python/array_host_ops.cc +++ b/shortfin/python/array_host_ops.cc @@ -95,6 +95,7 @@ void BindArrayHostOps(py::module_ &m) { "argmax", [](device_array &input, int axis, std::optional out, bool keepdims, bool device_visible) { + SHORTFIN_TRACE_SCOPE_NAMED("PyHostOp::argmax"); if (axis < 0) axis += input.shape().size(); if (axis < 0 || axis >= input.shape().size()) { throw std::invalid_argument( @@ -139,6 +140,7 @@ void BindArrayHostOps(py::module_ &m) { m.def( "fill_randn", [](device_array out, std::optional gen) { + SHORTFIN_TRACE_SCOPE_NAMED("PyHostOp::fill_randn"); if (!gen) gen = &PyRandomGenerator::get_default(); auto compute = [&]() { auto result = xt::random::randn(out.shape_container(), /*mean=*/0.0, diff --git a/shortfin/python/lib_ext.cc b/shortfin/python/lib_ext.cc index 0bfb67588..d17606b4b 100644 --- a/shortfin/python/lib_ext.cc +++ b/shortfin/python/lib_ext.cc @@ -173,6 +173,7 @@ class PyWorkerExtension : public local::Worker::Extension { py::handle loop() { return loop_; } void OnThreadStart() noexcept override { + SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::OnThreadStart"); // Python threading initialization. // If our own thread, teach Python about it. Not done for donated. if (worker().options().owned_thread) { @@ -187,6 +188,7 @@ class PyWorkerExtension : public local::Worker::Extension { } void OnThreadStop() noexcept override { + SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::OnThreadStop"); { // Do Python level thread cleanup. py::gil_scoped_acquire g; @@ -253,6 +255,7 @@ class PyProcess : public local::detail::BaseProcess { std::bind(&PyProcess::RunOnWorker, self_object)); } static void RunOnWorker(py::handle self_handle) { + SHORTFIN_TRACE_SCOPE_NAMED("PyProcess:RunOnWorker"); py::gil_scoped_acquire g; // Steal the reference back from ScheduleOnWorker. Important: this is // very likely the last reference to the process. So self must not be @@ -342,6 +345,7 @@ py::object PyRehydrateRef(local::ProgramInvocation *inv, py::object RunInForeground(std::shared_ptr refs, local::System &self, py::object coro) { + SHORTFIN_TRACE_SCOPE_NAMED("CoroRunInForeground"); bool is_main_thread = refs->threading_current_thread().is(refs->threading_main_thread()); @@ -936,6 +940,7 @@ void BindLocal(py::module_ &m) { callable.inc_ref(); // Stolen within the callback. auto thunk = +[](void *user_data, iree_loop_t loop, iree_status_t status) noexcept -> iree_status_t { + SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::Callback"); py::gil_scoped_acquire g; py::object user_callable = py::steal(static_cast(user_data)); @@ -955,6 +960,7 @@ void BindLocal(py::module_ &m) { callable.inc_ref(); // Stolen within the callback. auto thunk = +[](void *user_data, iree_loop_t loop, iree_status_t status) noexcept -> iree_status_t { + SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::DelayCallback"); py::gil_scoped_acquire g; py::object user_callable = py::steal(static_cast(user_data)); @@ -1030,6 +1036,7 @@ void BindLocal(py::module_ &m) { py::class_(m, "CompletionEvent") .def(py::init<>()) .def("__await__", [](py::handle self_obj) { + SHORTFIN_TRACE_SCOPE_NAMED("PyCompletionEvent::__await__"); auto &worker_ext = PyWorkerExtension::GetCurrent(); auto &self = py::cast(self_obj); py::object future = worker_ext.loop().attr("create_future")(); @@ -1051,6 +1058,7 @@ void BindLocal(py::module_ &m) { self, iree_infinite_timeout(), +[](void *future_vp, iree_loop_t loop, iree_status_t status) noexcept -> iree_status_t { + SHORTFIN_TRACE_SCOPE_NAMED("PyCompletionEvent::OnComplete"); py::gil_scoped_acquire g; py::object future = py::steal(static_cast(future_vp)); try { @@ -1145,6 +1153,7 @@ void BindLocal(py::module_ &m) { return py::none(); }) .def("__await__", [](py::handle self_obj) { + SHORTFIN_TRACE_SCOPE_NAMED("PyFuture::__await__"); // TODO: We should make our C++ future able to be used directly // vs needing to bridge it like this. auto &worker_ext = PyWorkerExtension::GetCurrent(); @@ -1166,6 +1175,7 @@ void BindLocal(py::module_ &m) { self.AddCallback( [py_future_vp = static_cast(future.release().ptr())]( local::Future &sf_future) { + SHORTFIN_TRACE_SCOPE_NAMED("PyFuture::OnComplete"); py::gil_scoped_acquire g; py::object py_future = py::steal(static_cast(py_future_vp)); diff --git a/shortfin/src/shortfin/array/array.cc b/shortfin/src/shortfin/array/array.cc index 11961b449..882e4ef39 100644 --- a/shortfin/src/shortfin/array/array.cc +++ b/shortfin/src/shortfin/array/array.cc @@ -64,6 +64,7 @@ mapping device_array::data_rw() { return storage_.map_read_write(); } mapping device_array::data_w() { return storage_.map_write_discard(); } std::optional device_array::map_memory_for_xtensor() { + SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::map_memory_for_xtensor"); if (storage_.is_mappable_for_read_write()) { return storage_.map_read_write(); } else if (storage_.is_mappable_for_read()) { @@ -97,6 +98,7 @@ std::string device_array::to_s() const { void device_array::AddAsInvocationArgument( local::ProgramInvocation *inv, local::ProgramResourceBarrier barrier) { + SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::AddAsInvocationArgument"); auto dims_span = shape(); iree_hal_buffer_view_t *buffer_view; SHORTFIN_THROW_IF_ERROR(iree_hal_buffer_view_create( @@ -117,6 +119,7 @@ iree_vm_ref_type_t device_array::invocation_marshalable_type() { device_array device_array::CreateFromInvocationResultRef( local::ProgramInvocation *inv, iree::vm_opaque_ref ref) { + SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::CreateFromInvocationResultRef"); // We don't retain the buffer view in the device array, so just deref it // vs stealing the ref. iree_hal_buffer_view_t *bv = iree_hal_buffer_view_deref(*ref.get()); diff --git a/shortfin/src/shortfin/array/storage.cc b/shortfin/src/shortfin/array/storage.cc index a30dbf450..ffbbd9ba2 100644 --- a/shortfin/src/shortfin/array/storage.cc +++ b/shortfin/src/shortfin/array/storage.cc @@ -43,6 +43,7 @@ storage storage::import_buffer(local::ScopedDevice &device, storage storage::allocate_device(ScopedDevice &device, iree_device_size_t allocation_size) { + SHORTFIN_TRACE_SCOPE_NAMED("storage::allocate_device"); if (!device.raw_device()) { throw std::invalid_argument("Cannot allocate with a null device affinity"); } @@ -63,6 +64,7 @@ storage storage::allocate_device(ScopedDevice &device, storage storage::allocate_host(ScopedDevice &device, iree_device_size_t allocation_size, bool device_visible) { + SHORTFIN_TRACE_SCOPE_NAMED("storage::allocate_host"); if (!device.raw_device()) { throw std::invalid_argument("Cannot allocate with a null device affinity"); } @@ -207,6 +209,7 @@ std::string storage::formatted_buffer_usage() const { void storage::AddAsInvocationArgument(local::ProgramInvocation *inv, local::ProgramResourceBarrier barrier) { + SHORTFIN_TRACE_SCOPE_NAMED("storage::AddAsInvocationArgument"); iree::vm_opaque_ref ref; *(&ref) = iree_hal_buffer_retain_ref(buffer_); inv->AddArg(std::move(ref)); @@ -220,6 +223,7 @@ iree_vm_ref_type_t storage::invocation_marshalable_type() { storage storage::CreateFromInvocationResultRef(local::ProgramInvocation *inv, iree::vm_opaque_ref ref) { + SHORTFIN_TRACE_SCOPE_NAMED("storage::CreateFromInvocationResultRef"); // Steal the ref to one of our smart pointers. // TODO: Should have an opaque_ref::release(). iree::hal_buffer_ptr buffer = @@ -230,6 +234,7 @@ storage storage::CreateFromInvocationResultRef(local::ProgramInvocation *inv, storage storage::ImportInvocationResultStorage(local::ProgramInvocation *inv, iree::hal_buffer_ptr buffer) { + SHORTFIN_TRACE_SCOPE_NAMED("storage::ImportInvocationResultStorage"); local::ScopedDevice device = local::ScopedDevice(*inv->fiber(), inv->device_selection()); auto imported_storage = storage::import_buffer(device, std::move(buffer)); @@ -251,6 +256,7 @@ storage storage::ImportInvocationResultStorage(local::ProgramInvocation *inv, void storage::AddInvocationArgBarrier(local::ProgramInvocation *inv, local::ProgramResourceBarrier barrier) { + SHORTFIN_TRACE_SCOPE_NAMED("storage::AddInvocationArgBarrier"); switch (barrier) { case ProgramResourceBarrier::DEFAULT: case ProgramResourceBarrier::READ: diff --git a/shortfin/src/shortfin/array/xtensor_bridge.cc b/shortfin/src/shortfin/array/xtensor_bridge.cc index bd3753331..da350b71a 100644 --- a/shortfin/src/shortfin/array/xtensor_bridge.cc +++ b/shortfin/src/shortfin/array/xtensor_bridge.cc @@ -8,6 +8,7 @@ #include +#include "shortfin/support/logging.h" #include "xtl/xhalf_float.hpp" namespace shortfin::array { @@ -56,6 +57,7 @@ class typed_xt_methods final : public poly_xt_methods { bool poly_xt_methods::inplace_new(uint8_t *inst_storage, DType dtype, void *array_memory, size_t array_memory_size, Dims &dims) { + SHORTFIN_TRACE_SCOPE_NAMED("array_xtensor_cast"); #define POLY_XT_CASE(et, cpp_type) \ case et: \ typed_xt_methods::concrete_inplace_new( \ diff --git a/shortfin/src/shortfin/local/program.cc b/shortfin/src/shortfin/local/program.cc index 3fd41d87b..6ab1f47ae 100644 --- a/shortfin/src/shortfin/local/program.cc +++ b/shortfin/src/shortfin/local/program.cc @@ -75,6 +75,7 @@ std::string_view ProgramFunction::calling_convention() const { ProgramInvocation::Ptr ProgramFunction::CreateInvocation( std::shared_ptr fiber, std::optional isolation) { + SHORTFIN_TRACE_SCOPE_NAMED("ProgramFunction::CreateInvocation"); ProgramIsolation actual_isolation = isolation ? *isolation : isolation_; // Low-overhead NONE isolation handling (saves some ref-count twiddling). if (actual_isolation == ProgramIsolation::NONE) { @@ -101,6 +102,7 @@ std::string ProgramFunction::to_s() const { ProgramModule ProgramModule::Load(System &system, const std::filesystem::path &path, bool mmap) { + SHORTFIN_TRACE_SCOPE_NAMED("ProgramModule::Load"); iree::file_contents_ptr contents; iree_file_read_flags_t flags = mmap ? IREE_FILE_READ_FLAG_MMAP : IREE_FILE_READ_FLAG_PRELOAD; @@ -171,6 +173,7 @@ std::vector ProgramModule::exports() const { Program Program::Load(std::span modules, Options &&options) { + SHORTFIN_TRACE_SCOPE_NAMED("Program::Load"); std::vector all_modules; std::vector raw_devices; @@ -451,6 +454,7 @@ iree_status_t ProgramInvocation::FinalizeCallingConvention( ProgramInvocation::Future ProgramInvocation::Invoke( ProgramInvocation::Ptr invocation) { + SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::Invoke"); invocation->CheckNotScheduled(); Worker &worker = invocation->fiber_->worker(); @@ -462,9 +466,11 @@ ProgramInvocation::Future ProgramInvocation::Invoke( iree_vm_function_t function, ProgramInvocationModel invocation_model, std::optional failure_future) { + SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::InvokeAsync"); auto complete_callback = [](void *user_data, iree_loop_t loop, iree_status_t status, iree_vm_list_t *outputs) noexcept -> iree_status_t { + SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::Complete"); // Async invocation helpfully gives us a retained reference to the // outputs, but we already have one statically on the // ProgramInvocation. So release this one, which makes it safe to @@ -620,6 +626,7 @@ StaticProgramParameters::StaticProgramParameters( void StaticProgramParameters::Load(std::filesystem::path file_path, LoadOptions options) { + SHORTFIN_TRACE_SCOPE_NAMED("StaticProgramParameters::Load"); // Default format from extension. if (options.format.empty()) { options.format = file_path.extension().string(); diff --git a/shortfin/src/shortfin/local/scheduler.cc b/shortfin/src/shortfin/local/scheduler.cc index 3b82ded20..883951a20 100644 --- a/shortfin/src/shortfin/local/scheduler.cc +++ b/shortfin/src/shortfin/local/scheduler.cc @@ -61,6 +61,7 @@ void Account::active_deps_extend(iree_hal_semaphore_list_t sem_list) { } VoidFuture Account::OnSync() { + SHORTFIN_TRACE_SCOPE_NAMED("Account::OnSync"); // TODO: Burn this path with fire! No attempt has been made to make this // particularly good: the backend is being implemented now to export // HAL semaphores via iree_hal_semaphore_await, and that should be used @@ -133,6 +134,7 @@ Scheduler::~Scheduler() { void Scheduler::Initialize( std::span> devices) { + SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::Initialize"); for (auto &it : devices) { accounts_.emplace_back(*this, it.second); } @@ -165,6 +167,7 @@ Account &Scheduler::GetDefaultAccount(ScopedDevice &device) { void Scheduler::AppendCommandBuffer(ScopedDevice &device, TransactionType tx_type, std::function callback) { + SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::AppendCommandBuffer"); Account &account = GetDefaultAccount(device); auto needed_affinity_bits = device.affinity().queue_affinity(); SHORTFIN_SCHED_LOG( @@ -242,6 +245,7 @@ void Scheduler::AppendCommandBuffer(ScopedDevice &device, } iree_status_t Scheduler::FlushWithStatus() noexcept { + SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::FlushWithStatus"); // This loop is optimized for a small number of accounts, where it is // fine to just linearly probe. If this ever becomes cumbersome, we can // maintain a dirty list which is appended to when an account transitions diff --git a/shortfin/src/shortfin/local/system.cc b/shortfin/src/shortfin/local/system.cc index f5012c626..ef31bb001 100644 --- a/shortfin/src/shortfin/local/system.cc +++ b/shortfin/src/shortfin/local/system.cc @@ -20,6 +20,7 @@ namespace shortfin::local { System::System(iree_allocator_t host_allocator) : host_allocator_(host_allocator) { + SHORTFIN_TRACE_SCOPE_NAMED("System::System"); logging::construct("System", this); SHORTFIN_THROW_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT, host_allocator_, @@ -29,6 +30,7 @@ System::System(iree_allocator_t host_allocator) } System::~System() { + SHORTFIN_TRACE_SCOPE_NAMED("System::~System"); logging::destruct("System", this); bool needs_shutdown = false; { @@ -61,6 +63,7 @@ System::~System() { } void System::Shutdown() { + SHORTFIN_TRACE_SCOPE_NAMED("System::Shutdown"); // Stop workers. std::vector local_workers; { diff --git a/shortfin/src/shortfin/local/systems/amdgpu.cc b/shortfin/src/shortfin/local/systems/amdgpu.cc index 78efad709..cecedd1a0 100644 --- a/shortfin/src/shortfin/local/systems/amdgpu.cc +++ b/shortfin/src/shortfin/local/systems/amdgpu.cc @@ -87,6 +87,7 @@ void AMDGPUSystemBuilder::InitializeDefaultSettings() { void AMDGPUSystemBuilder::Enumerate() { if (hip_hal_driver_) return; + SHORTFIN_TRACE_SCOPE_NAMED("AMDGPUSystemBuilder::Enumerate"); iree_hal_hip_driver_options_t driver_options; iree_hal_hip_driver_options_initialize(&driver_options); @@ -127,6 +128,7 @@ std::vector AMDGPUSystemBuilder::GetAvailableDeviceIds() { } SystemPtr AMDGPUSystemBuilder::CreateSystem() { + SHORTFIN_TRACE_SCOPE_NAMED("AMDGPUSystemBuilder::CreateSystem"); auto lsys = std::make_shared(host_allocator()); Enumerate(); diff --git a/shortfin/src/shortfin/local/systems/host.cc b/shortfin/src/shortfin/local/systems/host.cc index 440a3ff51..1da4b2af1 100644 --- a/shortfin/src/shortfin/local/systems/host.cc +++ b/shortfin/src/shortfin/local/systems/host.cc @@ -125,6 +125,7 @@ HostCPUSystemBuilder::SelectHostCPUNodesFromOptions() { } SystemPtr HostCPUSystemBuilder::CreateSystem() { + SHORTFIN_TRACE_SCOPE_NAMED("HostCPUSystemBuilder::CreateSystem"); auto lsys = std::make_shared(host_allocator()); // TODO: Real NUMA awareness. lsys->InitializeNodes(1); @@ -136,6 +137,7 @@ SystemPtr HostCPUSystemBuilder::CreateSystem() { } iree_hal_driver_t *HostCPUSystemBuilder::InitializeHostCPUDriver(System &lsys) { + SHORTFIN_TRACE_SCOPE_NAMED("HostCPUSystemBuilder::InitializeHostCPUDriver"); // TODO: Kill these flag variants in favor of settings on the config // object. SHORTFIN_THROW_IF_ERROR(iree_task_executor_options_initialize_from_flags( @@ -206,6 +208,7 @@ iree_hal_driver_t *HostCPUSystemBuilder::InitializeHostCPUDriver(System &lsys) { void HostCPUSystemBuilder::InitializeHostCPUDevices(System &lsys, iree_hal_driver_t *driver) { + SHORTFIN_TRACE_SCOPE_NAMED("HostCPUSystemBuilder::InitializeHostCPUDevices"); iree_host_size_t device_info_count = 0; iree::allocated_ptr device_infos(host_allocator()); SHORTFIN_THROW_IF_ERROR(iree_hal_driver_query_available_devices( diff --git a/shortfin/src/shortfin/local/worker.cc b/shortfin/src/shortfin/local/worker.cc index 09207e5e4..d5ffafdbe 100644 --- a/shortfin/src/shortfin/local/worker.cc +++ b/shortfin/src/shortfin/local/worker.cc @@ -109,6 +109,7 @@ iree_status_t Worker::TransactLoop(iree_status_t signal_status) { for (auto& next_thunk : next_thunks_) { // TODO: Make thunks have to return a status, propagate, and handle // exceptions. + SHORTFIN_TRACE_SCOPE_NAMED("Worker::ThreadsafeCallback"); next_thunk(); } next_thunks_.clear(); diff --git a/shortfin/src/shortfin/support/logging.h b/shortfin/src/shortfin/support/logging.h index 7bc9e130d..e70c54e99 100644 --- a/shortfin/src/shortfin/support/logging.h +++ b/shortfin/src/shortfin/support/logging.h @@ -23,6 +23,14 @@ #define SHORTFIN_SCHED_LOG(...) #endif +// Tracing macros. These are currently just aliases of the underlying IREE +// macros, but we maintain the ability to redirect them in the future (i.e. +// for certain kinds of library builds, etc). +#define SHORTFIN_TRACE_SCOPE IREE_TRACE_SCOPE +#define SHORTFIN_TRACE_SCOPE_NAMED(name_literal) \ + IREE_TRACE_SCOPE_NAMED(name_literal) +#define SHORTFIN_TRACE_SCOPE_ID IREE_TRACE_SCOPE_ID + namespace shortfin::logging { SHORTFIN_API void InitializeFromEnv();