Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[shortfin] Adds C++ tracing scopes. #480

Merged
merged 1 commit into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions shortfin/python/array_binding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "./lib_ext.h"
#include "./utils.h"
#include "shortfin/array/api.h"
#include "shortfin/support/logging.h"

using namespace shortfin::array;

Expand Down Expand Up @@ -223,6 +224,7 @@ class PyMapping {
}

void FillFromScalar(Refs *refs, py::handle value) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::FillFromScalar");
if (!dtype()) {
throw std::invalid_argument(
"The `fill` method is only valid for typed mappings but "
Expand All @@ -242,6 +244,7 @@ class PyMapping {
}

void FillFromBuffer(py::handle buffer) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::FillFromBuffer");
Py_buffer py_view;
int flags = PyBUF_FORMAT | PyBUF_ND; // C-Contiguous ND.
if (PyObject_GetBuffer(buffer.ptr(), &py_view, flags) != 0) {
Expand Down Expand Up @@ -286,6 +289,7 @@ class PyMapping {
}

py::object GetItems(py::handle self_obj, Refs *refs) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::GetItems");
if (!dtype()) {
throw std::invalid_argument(
"The `items` property is only valid for typed mappings but "
Expand All @@ -306,6 +310,7 @@ class PyMapping {
}

void SetItems(Refs *refs, py::handle initializer) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::SetItems");
if (!dtype()) {
throw std::invalid_argument(
"The `items` property is only valid for typed mappings but "
Expand Down Expand Up @@ -410,6 +415,7 @@ void BindArray(py::module_ &m) {
.def(
"map",
[](storage &self, bool read, bool write, bool discard) {
SHORTFIN_TRACE_SCOPE_NAMED("PyStorage::map");
int access = 0;
if (read) access |= IREE_HAL_MEMORY_ACCESS_READ;
if (write || discard) access |= IREE_HAL_MEMORY_ACCESS_WRITE;
Expand Down Expand Up @@ -565,6 +571,7 @@ void BindArray(py::module_ &m) {
.def(
"map",
[](device_array &self, bool read, bool write, bool discard) {
SHORTFIN_TRACE_SCOPE_NAMED("PyArray::map");
int access = 0;
if (read) access |= IREE_HAL_MEMORY_ACCESS_READ;
if (write || discard) access |= IREE_HAL_MEMORY_ACCESS_WRITE;
Expand All @@ -586,6 +593,7 @@ void BindArray(py::module_ &m) {
.def_prop_rw(
"items",
[refs](device_array &self) {
SHORTFIN_TRACE_SCOPE_NAMED("PyArray::items");
PyMapping *mapping;
py::object mapping_obj = CreateMappingObject(&mapping);
mapping->set_dtype(self.dtype());
Expand All @@ -606,6 +614,7 @@ void BindArray(py::module_ &m) {
.def_prop_ro(
"__array_interface__",
[refs](device_array &self) {
SHORTFIN_TRACE_SCOPE_NAMED("PyArray::__array_interface__");
py::dict interface;
interface["version"] = 3;
interface["strides"] = py::none();
Expand Down
2 changes: 2 additions & 0 deletions shortfin/python/array_host_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ void BindArrayHostOps(py::module_ &m) {
"argmax",
[](device_array &input, int axis, std::optional<device_array> out,
bool keepdims, bool device_visible) {
SHORTFIN_TRACE_SCOPE_NAMED("PyHostOp::argmax");
if (axis < 0) axis += input.shape().size();
if (axis < 0 || axis >= input.shape().size()) {
throw std::invalid_argument(
Expand Down Expand Up @@ -139,6 +140,7 @@ void BindArrayHostOps(py::module_ &m) {
m.def(
"fill_randn",
[](device_array out, std::optional<PyRandomGenerator *> gen) {
SHORTFIN_TRACE_SCOPE_NAMED("PyHostOp::fill_randn");
if (!gen) gen = &PyRandomGenerator::get_default();
auto compute = [&]<typename EltTy>() {
auto result = xt::random::randn(out.shape_container(), /*mean=*/0.0,
Expand Down
10 changes: 10 additions & 0 deletions shortfin/python/lib_ext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ class PyWorkerExtension : public local::Worker::Extension {
py::handle loop() { return loop_; }

void OnThreadStart() noexcept override {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::OnThreadStart");
// Python threading initialization.
// If our own thread, teach Python about it. Not done for donated.
if (worker().options().owned_thread) {
Expand All @@ -187,6 +188,7 @@ class PyWorkerExtension : public local::Worker::Extension {
}

void OnThreadStop() noexcept override {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::OnThreadStop");
{
// Do Python level thread cleanup.
py::gil_scoped_acquire g;
Expand Down Expand Up @@ -253,6 +255,7 @@ class PyProcess : public local::detail::BaseProcess {
std::bind(&PyProcess::RunOnWorker, self_object));
}
static void RunOnWorker(py::handle self_handle) {
SHORTFIN_TRACE_SCOPE_NAMED("PyProcess:RunOnWorker");
py::gil_scoped_acquire g;
// Steal the reference back from ScheduleOnWorker. Important: this is
// very likely the last reference to the process. So self must not be
Expand Down Expand Up @@ -342,6 +345,7 @@ py::object PyRehydrateRef(local::ProgramInvocation *inv,

py::object RunInForeground(std::shared_ptr<Refs> refs, local::System &self,
py::object coro) {
SHORTFIN_TRACE_SCOPE_NAMED("CoroRunInForeground");
bool is_main_thread =
refs->threading_current_thread().is(refs->threading_main_thread());

Expand Down Expand Up @@ -936,6 +940,7 @@ void BindLocal(py::module_ &m) {
callable.inc_ref(); // Stolen within the callback.
auto thunk = +[](void *user_data, iree_loop_t loop,
iree_status_t status) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::Callback");
py::gil_scoped_acquire g;
py::object user_callable =
py::steal(static_cast<PyObject *>(user_data));
Expand All @@ -955,6 +960,7 @@ void BindLocal(py::module_ &m) {
callable.inc_ref(); // Stolen within the callback.
auto thunk = +[](void *user_data, iree_loop_t loop,
iree_status_t status) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::DelayCallback");
py::gil_scoped_acquire g;
py::object user_callable =
py::steal(static_cast<PyObject *>(user_data));
Expand Down Expand Up @@ -1030,6 +1036,7 @@ void BindLocal(py::module_ &m) {
py::class_<local::CompletionEvent>(m, "CompletionEvent")
.def(py::init<>())
.def("__await__", [](py::handle self_obj) {
SHORTFIN_TRACE_SCOPE_NAMED("PyCompletionEvent::__await__");
auto &worker_ext = PyWorkerExtension::GetCurrent();
auto &self = py::cast<local::CompletionEvent &>(self_obj);
py::object future = worker_ext.loop().attr("create_future")();
Expand All @@ -1051,6 +1058,7 @@ void BindLocal(py::module_ &m) {
self, iree_infinite_timeout(),
+[](void *future_vp, iree_loop_t loop,
iree_status_t status) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("PyCompletionEvent::OnComplete");
py::gil_scoped_acquire g;
py::object future = py::steal(static_cast<PyObject *>(future_vp));
try {
Expand Down Expand Up @@ -1145,6 +1153,7 @@ void BindLocal(py::module_ &m) {
return py::none();
})
.def("__await__", [](py::handle self_obj) {
SHORTFIN_TRACE_SCOPE_NAMED("PyFuture::__await__");
// TODO: We should make our C++ future able to be used directly
// vs needing to bridge it like this.
auto &worker_ext = PyWorkerExtension::GetCurrent();
Expand All @@ -1166,6 +1175,7 @@ void BindLocal(py::module_ &m) {
self.AddCallback(
[py_future_vp = static_cast<void *>(future.release().ptr())](
local::Future &sf_future) {
SHORTFIN_TRACE_SCOPE_NAMED("PyFuture::OnComplete");
py::gil_scoped_acquire g;
py::object py_future =
py::steal(static_cast<PyObject *>(py_future_vp));
Expand Down
3 changes: 3 additions & 0 deletions shortfin/src/shortfin/array/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ mapping device_array::data_rw() { return storage_.map_read_write(); }
mapping device_array::data_w() { return storage_.map_write_discard(); }

std::optional<mapping> device_array::map_memory_for_xtensor() {
SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::map_memory_for_xtensor");
if (storage_.is_mappable_for_read_write()) {
return storage_.map_read_write();
} else if (storage_.is_mappable_for_read()) {
Expand Down Expand Up @@ -97,6 +98,7 @@ std::string device_array::to_s() const {

void device_array::AddAsInvocationArgument(
local::ProgramInvocation *inv, local::ProgramResourceBarrier barrier) {
SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::AddAsInvocationArgument");
auto dims_span = shape();
iree_hal_buffer_view_t *buffer_view;
SHORTFIN_THROW_IF_ERROR(iree_hal_buffer_view_create(
Expand All @@ -117,6 +119,7 @@ iree_vm_ref_type_t device_array::invocation_marshalable_type() {

device_array device_array::CreateFromInvocationResultRef(
local::ProgramInvocation *inv, iree::vm_opaque_ref ref) {
SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::CreateFromInvocationResultRef");
// We don't retain the buffer view in the device array, so just deref it
// vs stealing the ref.
iree_hal_buffer_view_t *bv = iree_hal_buffer_view_deref(*ref.get());
Expand Down
6 changes: 6 additions & 0 deletions shortfin/src/shortfin/array/storage.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ storage storage::import_buffer(local::ScopedDevice &device,

storage storage::allocate_device(ScopedDevice &device,
iree_device_size_t allocation_size) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::allocate_device");
if (!device.raw_device()) {
throw std::invalid_argument("Cannot allocate with a null device affinity");
}
Expand All @@ -63,6 +64,7 @@ storage storage::allocate_device(ScopedDevice &device,
storage storage::allocate_host(ScopedDevice &device,
iree_device_size_t allocation_size,
bool device_visible) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::allocate_host");
if (!device.raw_device()) {
throw std::invalid_argument("Cannot allocate with a null device affinity");
}
Expand Down Expand Up @@ -207,6 +209,7 @@ std::string storage::formatted_buffer_usage() const {

void storage::AddAsInvocationArgument(local::ProgramInvocation *inv,
local::ProgramResourceBarrier barrier) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::AddAsInvocationArgument");
iree::vm_opaque_ref ref;
*(&ref) = iree_hal_buffer_retain_ref(buffer_);
inv->AddArg(std::move(ref));
Expand All @@ -220,6 +223,7 @@ iree_vm_ref_type_t storage::invocation_marshalable_type() {

storage storage::CreateFromInvocationResultRef(local::ProgramInvocation *inv,
iree::vm_opaque_ref ref) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::CreateFromInvocationResultRef");
// Steal the ref to one of our smart pointers.
// TODO: Should have an opaque_ref::release().
iree::hal_buffer_ptr buffer =
Expand All @@ -230,6 +234,7 @@ storage storage::CreateFromInvocationResultRef(local::ProgramInvocation *inv,

storage storage::ImportInvocationResultStorage(local::ProgramInvocation *inv,
iree::hal_buffer_ptr buffer) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::ImportInvocationResultStorage");
local::ScopedDevice device =
local::ScopedDevice(*inv->fiber(), inv->device_selection());
auto imported_storage = storage::import_buffer(device, std::move(buffer));
Expand All @@ -251,6 +256,7 @@ storage storage::ImportInvocationResultStorage(local::ProgramInvocation *inv,

void storage::AddInvocationArgBarrier(local::ProgramInvocation *inv,
local::ProgramResourceBarrier barrier) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::AddInvocationArgBarrier");
switch (barrier) {
case ProgramResourceBarrier::DEFAULT:
case ProgramResourceBarrier::READ:
Expand Down
2 changes: 2 additions & 0 deletions shortfin/src/shortfin/array/xtensor_bridge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <sstream>

#include "shortfin/support/logging.h"
#include "xtl/xhalf_float.hpp"

namespace shortfin::array {
Expand Down Expand Up @@ -56,6 +57,7 @@ class typed_xt_methods final : public poly_xt_methods {
bool poly_xt_methods::inplace_new(uint8_t *inst_storage, DType dtype,
void *array_memory, size_t array_memory_size,
Dims &dims) {
SHORTFIN_TRACE_SCOPE_NAMED("array_xtensor_cast");
#define POLY_XT_CASE(et, cpp_type) \
case et: \
typed_xt_methods<cpp_type>::concrete_inplace_new( \
Expand Down
7 changes: 7 additions & 0 deletions shortfin/src/shortfin/local/program.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ std::string_view ProgramFunction::calling_convention() const {

ProgramInvocation::Ptr ProgramFunction::CreateInvocation(
std::shared_ptr<Fiber> fiber, std::optional<ProgramIsolation> isolation) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramFunction::CreateInvocation");
ProgramIsolation actual_isolation = isolation ? *isolation : isolation_;
// Low-overhead NONE isolation handling (saves some ref-count twiddling).
if (actual_isolation == ProgramIsolation::NONE) {
Expand All @@ -101,6 +102,7 @@ std::string ProgramFunction::to_s() const {
ProgramModule ProgramModule::Load(System &system,
const std::filesystem::path &path,
bool mmap) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramModule::Load");
iree::file_contents_ptr contents;
iree_file_read_flags_t flags =
mmap ? IREE_FILE_READ_FLAG_MMAP : IREE_FILE_READ_FLAG_PRELOAD;
Expand Down Expand Up @@ -171,6 +173,7 @@ std::vector<std::string> ProgramModule::exports() const {

Program Program::Load(std::span<const ProgramModule> modules,
Options &&options) {
SHORTFIN_TRACE_SCOPE_NAMED("Program::Load");
std::vector<iree_vm_module_t *> all_modules;
std::vector<iree_hal_device_t *> raw_devices;

Expand Down Expand Up @@ -451,6 +454,7 @@ iree_status_t ProgramInvocation::FinalizeCallingConvention(

ProgramInvocation::Future ProgramInvocation::Invoke(
ProgramInvocation::Ptr invocation) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::Invoke");
invocation->CheckNotScheduled();

Worker &worker = invocation->fiber_->worker();
Expand All @@ -462,9 +466,11 @@ ProgramInvocation::Future ProgramInvocation::Invoke(
iree_vm_function_t function,
ProgramInvocationModel invocation_model,
std::optional<ProgramInvocation::Future> failure_future) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::InvokeAsync");
auto complete_callback =
[](void *user_data, iree_loop_t loop, iree_status_t status,
iree_vm_list_t *outputs) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::Complete");
// Async invocation helpfully gives us a retained reference to the
// outputs, but we already have one statically on the
// ProgramInvocation. So release this one, which makes it safe to
Expand Down Expand Up @@ -620,6 +626,7 @@ StaticProgramParameters::StaticProgramParameters(

void StaticProgramParameters::Load(std::filesystem::path file_path,
LoadOptions options) {
SHORTFIN_TRACE_SCOPE_NAMED("StaticProgramParameters::Load");
// Default format from extension.
if (options.format.empty()) {
options.format = file_path.extension().string();
Expand Down
4 changes: 4 additions & 0 deletions shortfin/src/shortfin/local/scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ void Account::active_deps_extend(iree_hal_semaphore_list_t sem_list) {
}

VoidFuture Account::OnSync() {
SHORTFIN_TRACE_SCOPE_NAMED("Account::OnSync");
// TODO: Burn this path with fire! No attempt has been made to make this
// particularly good: the backend is being implemented now to export
// HAL semaphores via iree_hal_semaphore_await, and that should be used
Expand Down Expand Up @@ -133,6 +134,7 @@ Scheduler::~Scheduler() {

void Scheduler::Initialize(
std::span<const std::pair<std::string_view, Device *>> devices) {
SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::Initialize");
for (auto &it : devices) {
accounts_.emplace_back(*this, it.second);
}
Expand Down Expand Up @@ -165,6 +167,7 @@ Account &Scheduler::GetDefaultAccount(ScopedDevice &device) {
void Scheduler::AppendCommandBuffer(ScopedDevice &device,
TransactionType tx_type,
std::function<void(Account &)> callback) {
SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::AppendCommandBuffer");
Account &account = GetDefaultAccount(device);
auto needed_affinity_bits = device.affinity().queue_affinity();
SHORTFIN_SCHED_LOG(
Expand Down Expand Up @@ -242,6 +245,7 @@ void Scheduler::AppendCommandBuffer(ScopedDevice &device,
}

iree_status_t Scheduler::FlushWithStatus() noexcept {
SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::FlushWithStatus");
// This loop is optimized for a small number of accounts, where it is
// fine to just linearly probe. If this ever becomes cumbersome, we can
// maintain a dirty list which is appended to when an account transitions
Expand Down
3 changes: 3 additions & 0 deletions shortfin/src/shortfin/local/system.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ namespace shortfin::local {

System::System(iree_allocator_t host_allocator)
: host_allocator_(host_allocator) {
SHORTFIN_TRACE_SCOPE_NAMED("System::System");
logging::construct("System", this);
SHORTFIN_THROW_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT,
host_allocator_,
Expand All @@ -29,6 +30,7 @@ System::System(iree_allocator_t host_allocator)
}

System::~System() {
SHORTFIN_TRACE_SCOPE_NAMED("System::~System");
logging::destruct("System", this);
bool needs_shutdown = false;
{
Expand Down Expand Up @@ -61,6 +63,7 @@ System::~System() {
}

void System::Shutdown() {
SHORTFIN_TRACE_SCOPE_NAMED("System::Shutdown");
// Stop workers.
std::vector<Worker *> local_workers;
{
Expand Down
2 changes: 2 additions & 0 deletions shortfin/src/shortfin/local/systems/amdgpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ void AMDGPUSystemBuilder::InitializeDefaultSettings() {

void AMDGPUSystemBuilder::Enumerate() {
if (hip_hal_driver_) return;
SHORTFIN_TRACE_SCOPE_NAMED("AMDGPUSystemBuilder::Enumerate");

iree_hal_hip_driver_options_t driver_options;
iree_hal_hip_driver_options_initialize(&driver_options);
Expand Down Expand Up @@ -127,6 +128,7 @@ std::vector<std::string> AMDGPUSystemBuilder::GetAvailableDeviceIds() {
}

SystemPtr AMDGPUSystemBuilder::CreateSystem() {
SHORTFIN_TRACE_SCOPE_NAMED("AMDGPUSystemBuilder::CreateSystem");
auto lsys = std::make_shared<System>(host_allocator());
Enumerate();

Expand Down
Loading
Loading