Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Dynamic Selection] Adding sycl profiling for auto tune policy #1464

Merged
merged 31 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
9417ddb
Added static assert for keyArgs==Args
AnuyaWelling2801 Jan 9, 2024
7e54e28
WIP for jit compilation
AnuyaWelling2801 Jan 10, 2024
268f0d5
Fixing Autotune bug
AnuyaWelling2801 Jan 17, 2024
0dda712
Added sycl profiling without host task
AnuyaWelling2801 Mar 21, 2024
a38327a
Corrections in dynamic traits
AnuyaWelling2801 Mar 22, 2024
70f993c
Remove comments in dynamic_selection_traits
AnuyaWelling2801 Mar 22, 2024
59fa9a8
Merged main
AnuyaWelling2801 Mar 22, 2024
69f5014
Adding back jit compolation restrictions
AnuyaWelling2801 Mar 22, 2024
757c470
Adressing comments to add thread safety, better traits
AnuyaWelling2801 Mar 27, 2024
37a895a
Added backend trait to check if profiling is enabled
AnuyaWelling2801 Mar 27, 2024
d9fe635
Backend traits, renaming and adding a lock to lazy report
AnuyaWelling2801 Apr 4, 2024
95ee3ac
Fixed memory leaks
AnuyaWelling2801 Apr 9, 2024
cd5b318
No nullptr for selection handle in async waiter constructor
AnuyaWelling2801 Apr 15, 2024
e58f264
Added an erase and remove_if
AnuyaWelling2801 Apr 15, 2024
e134770
Adressing comments for profiling report, Adding profiling to default …
AnuyaWelling2801 Apr 24, 2024
6ce3b5e
Changes to std::chrono::duration
AnuyaWelling2801 Apr 25, 2024
a8ad96b
Addressed comments for sycl backend
AnuyaWelling2801 Apr 25, 2024
b70aeec
Addressed comments to sycl backend
AnuyaWelling2801 Apr 26, 2024
60710e2
Fix USM memory leaks and create unique task names (#1537)
SergeyKopienko Apr 26, 2024
1e76278
Fix the format of report method - using `report_duration` in second p…
SergeyKopienko Apr 26, 2024
c217e1a
Fixing clang format
AnuyaWelling2801 Apr 26, 2024
75dd226
Adding header file for backend_traits
AnuyaWelling2801 Apr 26, 2024
bc95cd2
Changed structure for sycl_backend submit
AnuyaWelling2801 Apr 26, 2024
19e78e7
Fixed clang format
AnuyaWelling2801 Apr 26, 2024
3d3dab2
Changes to make variables is_profiling_enabled and aliases for report…
AnuyaWelling2801 Apr 29, 2024
8009669
Moved is_profiling_enabled outside the async_waiter class
AnuyaWelling2801 Apr 29, 2024
ae46b12
Making sycl backend submit function more readable
AnuyaWelling2801 Apr 29, 2024
0cf0daa
Fixed clang format
AnuyaWelling2801 Apr 29, 2024
5028e63
Simplify sycl_backend::submit(SelectionHandle s, Function&& f, Args&&…
SergeyKopienko Apr 30, 2024
aa07b29
Minor changes to auto_tune_policy.h and sycl_backend.h
AnuyaWelling2801 Apr 30, 2024
797ea84
Improving readability
AnuyaWelling2801 Apr 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
#include <mutex>
#include <utility>
#include <chrono>
#include <ratio>
#include <limits>
#include <vector>
#include <type_traits>
#include <tuple>
#include <unordered_map>
#include "oneapi/dpl/internal/dynamic_selection_traits.h"
#include "oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h"
#if _DS_BACKEND_SYCL != 0
# include "oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h"
#endif
Expand All @@ -47,6 +49,9 @@ class auto_tune_policy
using size_type = typename std::vector<typename Backend::resource_type>::size_type;
using timing_t = uint64_t;

using report_clock_type = std::chrono::steady_clock;
using report_duration = std::chrono::milliseconds;

SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
static constexpr timing_t never_resample = 0;
static constexpr size_type use_best_resource = ~size_type(0);

Expand All @@ -66,7 +71,7 @@ class auto_tune_policy
{
std::mutex m_;

std::chrono::steady_clock::time_point t0_;
report_clock_type::time_point t0_;

timing_t best_timing_ = std::numeric_limits<timing_t>::max();
resource_with_index_t best_resource_;
Expand All @@ -80,7 +85,7 @@ class auto_tune_policy
timing_t resample_time_ = 0.0;

tuner_t(resource_with_index_t br, size_type resources_size, timing_t rt)
: t0_(std::chrono::steady_clock::now()), best_resource_(br), max_resource_to_profile_(resources_size),
: t0_(report_clock_type::now()), best_resource_(br), max_resource_to_profile_(resources_size),
resample_time_(rt)
{
}
Expand All @@ -100,8 +105,8 @@ class auto_tune_policy
}
else
{
auto now = std::chrono::steady_clock::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(now - t0_).count();
const auto now = report_clock_type::now();
const auto ms = std::chrono::duration_cast<report_duration>(now - t0_).count();
if (ms < resample_time_)
{
return use_best_resource;
Expand Down Expand Up @@ -217,6 +222,9 @@ class auto_tune_policy
select(Function&& f, Args&&... args)
{
static_assert(sizeof...(KeyArgs) == sizeof...(Args));
if constexpr(backend_traits::lazy_report_v<Backend>){
backend_->lazy_report();
}
if (state_)
{
std::lock_guard<std::mutex> l(state_->m_);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Copyright (C) 2023 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _ONEDPL_INTERNAL_BACKEND_TRAITS_H
#define _ONEDPL_INTERNAL_BACKEND_TRAITS_H

#include <utility>
#include <cstdint>
akukanov marked this conversation as resolved.
Show resolved Hide resolved
#include <type_traits>

namespace oneapi
{
namespace dpl
{
namespace experimental
{
namespace internal
{
template <typename Backend>
auto
has_lazy_report_impl(...) -> std::false_type;

template <typename Backend>
auto
has_lazy_report_impl(int) -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});

template <typename Backend>
struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
{
};

}//namespace internal

namespace backend_traits {
template <typename S>
struct lazy_report_value
{
static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report<S>::value;
};
template <typename S>
inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
akukanov marked this conversation as resolved.
Show resolved Hide resolved
akukanov marked this conversation as resolved.
Show resolved Hide resolved

} //namespace backend_traits

} // namespace experimental
} // namespace dpl
} // namespace oneapi

#endif /*_ONEDPL_INTERNAL_BACKEND_TRAITS_H*/
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ struct dynamic_load_policy
selection_type
select(Args&&...)
{
if constexpr(backend_traits::lazy_report_v<Backend>){
backend_->lazy_report();
}
if (state_)
{
std::lock_guard<std::mutex> l(state_->m_);
Expand Down
140 changes: 122 additions & 18 deletions include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
#include "oneapi/dpl/internal/dynamic_selection_impl/scoring_policy_defs.h"

#include <chrono>
#include <ratio>
#include <vector>
#include <memory>
#include <utility>
#include <algorithm>

namespace oneapi
{
Expand All @@ -35,25 +37,87 @@ class sycl_backend
using execution_resource_t = resource_type;
using resource_container_t = std::vector<execution_resource_t>;

using report_clock_type = std::chrono::steady_clock;
using report_duration = std::chrono::milliseconds;
akukanov marked this conversation as resolved.
Show resolved Hide resolved

static inline bool is_profiling_enabled = false;
private:
class async_waiter
class async_waiter_base{
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
public:
virtual void report() = 0;
virtual bool is_complete() = 0;
virtual ~async_waiter_base() = default;
};

template<typename Selection>
class async_waiter : public async_waiter_base
{
sycl::event e_;

std::shared_ptr<Selection> s;
public:
async_waiter(const sycl::event& e) : e_(e) {}
async_waiter() = default;
async_waiter(sycl::event e, std::shared_ptr<Selection> selection) : e_(e), s(selection) {}

sycl::event
unwrap()
{
return e_;
}

void
wait()
{
e_.wait();
}

void
report() override{
if constexpr (report_value_v<Selection, execution_info::task_time_t>){
if(s!=nullptr){
cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
akukanov marked this conversation as resolved.
Show resolved Hide resolved
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start);
s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(duration_in_ns).count());
}
}


}

bool
is_complete() override{
return e_.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete;
}

};

struct async_waiter_list_t{

std::mutex m_;
std::vector<std::unique_ptr<async_waiter_base>> async_waiters;

void add_waiter(async_waiter_base *t){
std::lock_guard<std::mutex> l(m_);
async_waiters.push_back(std::unique_ptr<async_waiter_base>(t));
}

void lazy_report(){
if(is_profiling_enabled){
std::lock_guard<std::mutex> l(m_);
async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::unique_ptr<async_waiter_base>& async_waiter){
if(async_waiter->is_complete()){
async_waiter->report();
return true;
}
return false;
}), async_waiters.end());
}
}
};

async_waiter_list_t async_waiter_list;


class submission_group
{
resource_container_t resources_;
Expand Down Expand Up @@ -85,11 +149,16 @@ class sycl_backend
template <typename NativeUniverseVector>
sycl_backend(const NativeUniverseVector& v)
{
bool profiling = true;
global_rank_.reserve(v.size());
for (auto e : v)
{
global_rank_.push_back(e);
if(!e.template has_property<sycl::property::queue::enable_profiling>()){
profiling = false;
}
}
is_profiling_enabled = profiling;
sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
}

Expand All @@ -105,28 +174,47 @@ class sycl_backend
if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t> ||
report_value_v<SelectionHandle, execution_info::task_time_t>)
{
std::chrono::steady_clock::time_point t0;
report_clock_type::time_point t0;
if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
{
t0 = std::chrono::steady_clock::now();
if (!is_profiling_enabled)
{
t0 = report_clock_type::now();
}
}
async_waiter<SelectionHandle> waiter;
auto e1 = f(q, std::forward<Args>(args)...);
auto e2 = q.submit([=](sycl::handler& h) {
h.depends_on(e1);
h.host_task([=]() {
if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)
{
if constexpr(report_info_v<SelectionHandle, execution_info::task_completion_t>){
auto e2 = q.submit([=](sycl::handler& h){
h.depends_on(e1);
h.host_task([=](){
s.report(execution_info::task_completion);
}
});
});
});
return async_waiter{e2};
waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
}
if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
if (is_profiling_enabled)
{
waiter = async_waiter{e1,std::make_shared<SelectionHandle>(s)};
async_waiter_list.add_waiter(new async_waiter(waiter));
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
}
else{
auto e2 = q.submit([=](sycl::handler& h){
h.depends_on(e1);
h.host_task([=](){
const auto tp_now = report_clock_type::now();
s.report(execution_info::task_time, std::chrono::duration_cast<report_duration>(tp_now - t0).count());
});
});
waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
}
}
return waiter;
}
else
{
return async_waiter{f(unwrap(s), std::forward<Args>(args)...)};
return async_waiter{f(unwrap(s), std::forward<Args>(args)...), std::make_shared<SelectionHandle>(s)};
akukanov marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand All @@ -142,17 +230,33 @@ class sycl_backend
return global_rank_;
}

void lazy_report(){
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is not thread safe. You might have concurrent accesses to the async_waiter array that change its size -- calls to lazy_report and submit (which calls add_waiter).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Created a new structure async_wait_arr.

async_waiter_list.lazy_report();
}

private:
resource_container_t global_rank_;
std::unique_ptr<submission_group> sgroup_ptr_;

void
initialize_default_resources()
{
bool profiling = true;
auto prop_list = sycl::property_list{};
auto devices = sycl::device::get_devices();
for (auto x : devices)
for (auto& x : devices)
{
if(!x.has(sycl::aspect::queue_profiling)){
profiling = false;
}
}
is_profiling_enabled = profiling;
if(is_profiling_enabled){
prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
}
for (auto& x : devices)
{
global_rank_.push_back(sycl::queue{x});
global_rank_.push_back(sycl::queue{x, prop_list});
}
}
};
Expand Down
Loading
Loading