From 9417ddb0b5770a206d411d0eac5a24b0ffa08fdd Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Tue, 9 Jan 2024 15:15:30 -0600 Subject: [PATCH 01/30] Added static assert for keyArgs==Args --- .../dpl/internal/dynamic_selection_impl/auto_tune_policy.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index c6b247e1abd..c5f95c83013 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -241,6 +241,7 @@ class auto_tune_policy auto submit(selection_type e, Function&& f, Args&&... args) { + static_assert(sizeof...(KeyArgs) == sizeof...(Args)); if (backend_) { return backend_->submit(e, std::forward(f), std::forward(args)...); From 7e54e28c41b1463f0226f6b0ef38f83398e0624b Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Wed, 10 Jan 2024 13:23:21 -0600 Subject: [PATCH 02/30] WIP for jit compilation --- .../dynamic_selection_impl/auto_tune_policy.h | 3 + .../dynamic_selection_impl/sycl_backend.h | 20 +- ...est_auto_tune_policy_sycl_profile.pass.cpp | 536 ++++++++++++++++++ 3 files changed, 557 insertions(+), 2 deletions(-) create mode 100644 test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index c5f95c83013..ac49493fd84 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -124,6 +124,7 @@ class auto_tune_policy { // ignore the 1st timing to cover for JIT compilation time_by_index_[index] = time_data_t{0, std::numeric_limits::max()}; + std::cout<<"This timing is ignored\n"; } else { @@ -132,6 +133,7 @@ class auto_tune_policy new_value = (n * td.value_ + t) / (n + 1); td.num_timings_ = n + 1; td.value_ = new_value; + } if (new_value < best_timing_) { @@ -221,6 +223,7 @@ class auto_tune_policy auto k = make_task_key(std::forward(f), std::forward(args)...); auto t = state_->tuner_by_key_[k]; auto index = t->get_resource_to_profile(); + std::cout<<"Device used : "<best_resource_, t}; diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 62746edf0d8..dd06775b310 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -106,9 +106,13 @@ class sycl_backend report_value_v) { std::chrono::steady_clock::time_point t0; + bool use_event_profiling = q.template has_property(); if constexpr (report_value_v) { - t0 = std::chrono::steady_clock::now(); + if (!use_event_profiling) + { + t0 = std::chrono::steady_clock::now(); + } } auto e1 = f(q, std::forward(args)...); auto e2 = q.submit([=](sycl::handler& h) { @@ -119,7 +123,19 @@ class sycl_backend s.report(execution_info::task_completion); } if constexpr (report_value_v) - s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count()); + { + if (use_event_profiling) + { + cl_ulong time_start = e1.template get_profiling_info(); + cl_ulong time_end = e1.template get_profiling_info(); + std::cout<<"Total time : "< +#include +#include "oneapi/dpl/dynamic_selection" +#include "support/test_dynamic_selection_utils.h" +#include "support/test_config.h" +#if TEST_DYNAMIC_SELECTION_AVAILABLE +# include "support/sycl_sanity.h" + +template +int +test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) +{ + using my_policy_t = Policy; + + int j; + std::vector v(1000000, 0.0); + + my_policy_t p{u}; + auto n_samples = u.size(); + + const int N = 10; + std::atomic ecount = 0; + bool pass = true; + + for (int i = 1; i <= N; ++i) + { + if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource) + { + j = 100; + } + else + { + j = 0; + } + const size_t bytes = 1000000 * sizeof(double); + if constexpr (call_select_before_submit) + { + auto f = [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { + if (i <= 2 * n_samples) + { + // we should be round-robining through the resources + if (q != u[(i - 1) % n_samples]) + { + std::cout << i << ": mismatch during rr phase\n" << std::flush; + pass = false; + } + } + else + { + if (q != u[best_resource]) + { + std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; + pass = false; + } + } + ecount += i; + if (j == 0) + { + return q.submit([&](sycl::handler& h) + { + h.single_task([=](){}); + }); + } + else + { + return q.submit([&](sycl::handler& h) { + double *d_v = sycl::malloc_device(1000000, q); + q.memcpy(d_v, v.data(), bytes).wait(); + q.parallel_for( + 1000000, [=](sycl::id<1> idx) { + for (int j0 = 0; j0 < j; ++j0) + { + d_v[idx] += idx; + } + }); + q.memcpy(v.data(), d_v, bytes).wait(); + }); + } + }; + auto s = oneapi::dpl::experimental::select(p, f); + auto e = oneapi::dpl::experimental::submit(s, f); + oneapi::dpl::experimental::wait(e); + } + else + { + // it's ok to capture by reference since we are waiting on each call + auto s = oneapi::dpl::experimental::submit( + p, [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { + if (i <= 2 * n_samples) + { + // we should be round-robining through the resources + if (q != u[(i - 1) % n_samples]) + { + std::cout << i << ": mismatch during rr phase\n" << std::flush; + pass = false; + } + } + else + { + if (q != u[best_resource]) + { + std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; + pass = false; + } + } + ecount += i; + if (j == 0) + { + return q.submit([&](sycl::handler& h){ + h.single_task([=](){}); + }); + } + else + { + return q.submit([&](sycl::handler& h) { + double *d_v = sycl::malloc_device(1000000, q); + q.memcpy(d_v, v.data(), bytes).wait(); + q.parallel_for( + 1000000, [=](sycl::id<1> idx) { + for (int j0 = 0; j0 < j; ++j0) + { + d_v[idx] += idx; + } + }); + q.memcpy(v.data(), d_v, bytes).wait(); + }); + } + }); + oneapi::dpl::experimental::wait(s); + } + + int count = ecount.load(); + if (count != i * (i + 1) / 2) + { + std::cout << "ERROR: scheduler did not execute all tasks exactly once\n"; + return 1; + } + } + if (!pass) + { + std::cout << "ERROR: did not select expected resources\n"; + return 1; + } + if constexpr (call_select_before_submit) + { + std::cout << "select then submit and wait on event: OK\n"; + } + else + { + std::cout << "submit and wait on event: OK\n"; + } + return 0; +} + +template +int +test_auto_submit_wait_on_group(UniverseContainer u, int best_resource) +{ + using my_policy_t = Policy; + + int j; + std::vector v(1000000, 0.0); + + my_policy_t p{u}; + auto n_samples = u.size(); + + const int N = 10; + std::atomic ecount = 0; + bool pass = true; + + for (int i = 1; i <= N; ++i) + { + if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource) + { + j = 100; + } + else + { + j = 0; + } + const size_t bytes = 1000000 * sizeof(double); + if constexpr (call_select_before_submit) + { + auto f = [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { + if (i <= 2 * n_samples) + { + // we should be round-robining through the resources + if (q != u[(i - 1) % n_samples]) + { + std::cout << i << ": mismatch during rr phase\n" << std::flush; + pass = false; + } + } + else + { + if (q != u[best_resource]) + { + std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; + pass = false; + } + } + ecount += i; + if (j == 0) + { + return q.submit([=](sycl::handler& h){ + h.single_task([=](){}); + }); + } + else + { + return q.submit([&](sycl::handler& h) { + double *d_v = sycl::malloc_device(1000000, q); + q.memcpy(d_v, v.data(), bytes).wait(); + h.parallel_for( + 1000000, [=](sycl::id<1> idx) { + for (int j0 = 0; j0 < j; ++j0) + { + d_v[idx] += idx; + } + }); + q.memcpy(v.data(), d_v, bytes).wait(); + }); + } + }; + auto s = oneapi::dpl::experimental::select(p, f); + auto e = oneapi::dpl::experimental::submit(s, f); + oneapi::dpl::experimental::wait(p.get_submission_group()); + } + else + { + // it's ok to capture by reference since we are waiting on each call + auto s = oneapi::dpl::experimental::submit( + p, [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { + if (i <= 2 * n_samples) + { + // we should be round-robining through the resources + if (q != u[(i - 1) % n_samples]) + { + std::cout << i << ": mismatch during rr phase\n" << std::flush; + pass = false; + } + } + else + { + if (q != u[best_resource]) + { + std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; + pass = false; + } + } + ecount += i; + if (j == 0) + { + return q.submit([=](sycl::handler& h){ + h.single_task([=](){}); + }); + } + else + { + return q.submit([&](sycl::handler& h) { + double *d_v = sycl::malloc_device(1000000, q); + q.memcpy(d_v, v.data(), bytes).wait(); + h.parallel_for( + 1000000, [=](sycl::id<1> idx) { + for (int j0 = 0; j0 < j; ++j0) + { + d_v[idx] += idx; + } + }); + q.memcpy(v.data(), d_v, bytes).wait(); + }); + } + }); + oneapi::dpl::experimental::wait(p.get_submission_group()); + } + + int count = ecount.load(); + if (count != i * (i + 1) / 2) + { + std::cout << "ERROR: scheduler did not execute all tasks exactly once\n"; + return 1; + } + } + if (!pass) + { + std::cout << "ERROR: did not select expected resources\n"; + return 1; + } + if constexpr (call_select_before_submit) + { + std::cout << "select then submit and wait on group: OK\n"; + } + else + { + std::cout << "submit and wait on group: OK\n"; + } + return 0; +} + +template +int +test_auto_submit_and_wait(UniverseContainer u, int best_resource) +{ + using my_policy_t = Policy; + + // they are cpus so this is ok + int j; + std::vector v(1000000, 0.0); + + my_policy_t p{u}; + auto n_samples = u.size(); + + const int N = 10; + std::atomic ecount = 0; + bool pass = true; + + for (int i = 1; i <= N; ++i) + { + if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource) + { + j = 100; + } + else + { + j = 0; + } + const size_t bytes = 1000000 * sizeof(double); + if constexpr (call_select_before_submit) + { + auto f = [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { + if (i <= 2 * n_samples) + { + // we should be round-robining through the resources + if (q != u[(i - 1) % n_samples]) + { + std::cout << i << ": mismatch during rr phase\n" << std::flush; + pass = false; + } + } + else + { + if (q != u[best_resource]) + { + std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; + pass = false; + } + } + ecount += i; + if (j == 0) + { + return q.submit([=](sycl::handler& h){ + h.single_task([=](){}); + }); + } + else + { + return q.submit([&](sycl::handler& h) { + double *d_v = sycl::malloc_device(1000000, q); + q.memcpy(d_v, v.data(), bytes).wait(); + h.parallel_for( + 1000000, [=](sycl::id<1> idx) { + for (int j0 = 0; j0 < j; ++j0) + { + d_v[idx] += idx; + } + }); + q.memcpy(v.data(), d_v, bytes).wait(); + }); + } + }; + auto s = oneapi::dpl::experimental::select(p, f); + oneapi::dpl::experimental::submit_and_wait(s, f); + } + else + { + // it's ok to capture by reference since we are waiting on each call + oneapi::dpl::experimental::submit_and_wait( + p, [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { + if (i <= 2 * n_samples) + { + // we should be round-robining through the resources + if (q != u[(i - 1) % n_samples]) + { + std::cout << i << ": mismatch during rr phase\n" << std::flush; + pass = false; + } + } + else + { + if (q != u[best_resource]) + { + std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; + pass = false; + } + } + ecount += i; + if (j == 0) + { + return q.submit([=](sycl::handler& h){ + h.single_task([=](){}); + }); + } + else + { + return q.submit([&](sycl::handler& h) { + double *d_v = sycl::malloc_device(1000000, q); + q.memcpy(d_v, v.data(), bytes).wait(); + h.parallel_for( + 1000000, [=](sycl::id<1> idx) { + for (int j0 = 0; j0 < j; ++j0) + { + d_v[idx] += idx; + } + }); + q.memcpy(v.data(), d_v, bytes).wait(); + }); + } + }); + } + + int count = ecount.load(); + if (count != i * (i + 1) / 2) + { + std::cout << "ERROR: scheduler did not execute all tasks exactly once\n"; + return 1; + } + } + if (!pass) + { + std::cout << "ERROR: did not select expected resources\n"; + return 1; + } + if constexpr (call_select_before_submit) + { + std::cout << "select then submit_and_wait: OK\n"; + } + else + { + std::cout << "submit_and_wait: OK\n"; + } + return 0; +} + +static inline void +build_auto_tune_universe1(std::vector& u) +{ + auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; + try + { + auto device_cpu = sycl::device(sycl::cpu_selector_v); + sycl::queue cpu_queue{device_cpu, prop_list}; + run_sycl_sanity_test(cpu_queue); + u.push_back(cpu_queue); + } + catch (const sycl::exception&) + { + std::cout << "SKIPPED: Unable to run with cpu_selector\n"; + } + try + { + auto device_gpu = sycl::device(sycl::gpu_selector_v); + sycl::queue gpu_queue{device_gpu, prop_list}; + run_sycl_sanity_test(gpu_queue); + u.push_back(gpu_queue); + } + catch (const sycl::exception&) + { + std::cout << "SKIPPED: Unable to run with gpu_selector\n"; + } +} + +#endif + +int +main() +{ +#if TEST_DYNAMIC_SELECTION_AVAILABLE + using policy_t = oneapi::dpl::experimental::auto_tune_policy; + std::vector u; + build_auto_tune_universe1(u); + + //If building the universe is not a success, return + if (u.size() == 0 || u.size()==0) + return 0; + + constexpr bool just_call_submit = false; + constexpr bool call_select_before_submit = true; + + if (test_auto_submit_wait_on_event(u, 0) || + test_auto_submit_wait_on_event(u, 1) || + test_auto_submit_wait_on_event(u, 0) || + test_auto_submit_wait_on_event(u, 1) || + test_auto_submit_wait_on_group(u, 0) || + test_auto_submit_wait_on_group(u, 1) || + test_auto_submit_wait_on_group(u, 0) || + test_auto_submit_wait_on_group(u, 1) || + test_auto_submit_and_wait(u, 0) || + test_auto_submit_and_wait(u, 1)|| + test_auto_submit_and_wait(u, 0) || + test_auto_submit_and_wait(u, 1) || + // now select then submits + test_auto_submit_wait_on_event(u, 0) || + test_auto_submit_wait_on_event(u, 1) || + test_auto_submit_wait_on_event(u, 0) || + test_auto_submit_wait_on_event(u, 1) || + test_auto_submit_wait_on_group(u, 0) || + test_auto_submit_wait_on_group(u, 1) || + test_auto_submit_wait_on_group(u, 0) || + test_auto_submit_wait_on_group(u, 1) || + test_auto_submit_and_wait(u, 0) || + test_auto_submit_and_wait(u, 1) || + test_auto_submit_and_wait(u, 0) || + test_auto_submit_and_wait(u, 1)) + { + std::cout << "FAIL\n"; + return 1; + } + else + { + std::cout << "PASS\n"; + return 0; + } +#else + std::cout << "SKIPPED\n"; + return 0; +#endif // TEST_DYNAMIC_SELECTION_AVAILABLE +} From 268f0d59bd38efdebfd8dc5b6ddaa9d3ba5f2787 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Wed, 17 Jan 2024 13:55:49 -0600 Subject: [PATCH 03/30] Fixing Autotune bug --- .../dynamic_selection_impl/auto_tune_policy.h | 11 +++-- .../dynamic_selection_impl/sycl_backend.h | 40 ++++++++++++++----- ...est_auto_tune_policy_sycl_profile.pass.cpp | 12 +++--- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index ac49493fd84..011b6dba0c0 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -120,21 +120,21 @@ class auto_tune_policy std::unique_lock l(m_); auto index = r.index_; timing_t new_value = t; - if (time_by_index_.count(index) == 0) + /*if (time_by_index_.count(index) == 0) { // ignore the 1st timing to cover for JIT compilation time_by_index_[index] = time_data_t{0, std::numeric_limits::max()}; std::cout<<"This timing is ignored\n"; } else - { + {*/ auto& td = time_by_index_[index]; auto n = td.num_timings_; new_value = (n * td.value_ + t) / (n + 1); td.num_timings_ = n + 1; td.value_ = new_value; - } + //} if (new_value < best_timing_) { best_timing_ = new_value; @@ -171,6 +171,7 @@ class auto_tune_policy void report(const execution_info::task_time_t&, const typename execution_info::task_time_t::value_type& v) const { + std::cout<<"Autotune report : "<add_new_timing(resource_, v); } }; @@ -223,14 +224,16 @@ class auto_tune_policy auto k = make_task_key(std::forward(f), std::forward(args)...); auto t = state_->tuner_by_key_[k]; auto index = t->get_resource_to_profile(); - std::cout<<"Device used : "<best_resource_.index_<<"\n"; return selection_type{*this, t->best_resource_, t}; } else { auto r = state_->resources_with_index_[index]; + std::cout<<"Trial Device used : "< #include #include +#include namespace oneapi { @@ -80,6 +81,7 @@ class sycl_backend { initialize_default_resources(); sgroup_ptr_ = std::make_unique(global_rank_); + number_of_resources=global_rank_.size(); } template @@ -91,6 +93,7 @@ class sycl_backend global_rank_.push_back(e); } sgroup_ptr_ = std::make_unique(global_rank_); + number_of_resources=global_rank_.size(); } template @@ -115,6 +118,7 @@ class sycl_backend } } auto e1 = f(q, std::forward(args)...); + break; auto e2 = q.submit([=](sycl::handler& h) { h.depends_on(e1); h.host_task([=]() { @@ -124,18 +128,35 @@ class sycl_backend } if constexpr (report_value_v) { - if (use_event_profiling) - { - cl_ulong time_start = e1.template get_profiling_info(); - cl_ulong time_end = e1.template get_profiling_info(); - std::cout<<"Total time : "<(); + //std::cout<<" Time start : "<(); + + //std::cout<<"Time end : "< number_of_resources;; resource_container_t global_rank_; std::unique_ptr sgroup_ptr_; diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp index 229f4bd11c2..b8cc1f686f0 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp @@ -115,12 +115,14 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) ecount += i; if (j == 0) { + std::cout<<"Device 0\n"; return q.submit([&](sycl::handler& h){ h.single_task([=](){}); }); } else { + std::cout<<"Device 1\n"; return q.submit([&](sycl::handler& h) { double *d_v = sycl::malloc_device(1000000, q); q.memcpy(d_v, v.data(), bytes).wait(); @@ -451,7 +453,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) } static inline void -build_auto_tune_universe1(std::vector& u) +build_auto_tune_universe(std::vector& u) { auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; try @@ -486,7 +488,7 @@ main() #if TEST_DYNAMIC_SELECTION_AVAILABLE using policy_t = oneapi::dpl::experimental::auto_tune_policy; std::vector u; - build_auto_tune_universe1(u); + build_auto_tune_universe(u); //If building the universe is not a success, return if (u.size() == 0 || u.size()==0) @@ -495,7 +497,7 @@ main() constexpr bool just_call_submit = false; constexpr bool call_select_before_submit = true; - if (test_auto_submit_wait_on_event(u, 0) || + if (test_auto_submit_wait_on_event(u, 0) /*|| test_auto_submit_wait_on_event(u, 1) || test_auto_submit_wait_on_event(u, 0) || test_auto_submit_wait_on_event(u, 1) || @@ -503,7 +505,7 @@ main() test_auto_submit_wait_on_group(u, 1) || test_auto_submit_wait_on_group(u, 0) || test_auto_submit_wait_on_group(u, 1) || - test_auto_submit_and_wait(u, 0) || + test_auto_submit_and_wait(u, 0) || test_auto_submit_and_wait(u, 1)|| test_auto_submit_and_wait(u, 0) || test_auto_submit_and_wait(u, 1) || @@ -519,7 +521,7 @@ main() test_auto_submit_and_wait(u, 0) || test_auto_submit_and_wait(u, 1) || test_auto_submit_and_wait(u, 0) || - test_auto_submit_and_wait(u, 1)) + test_auto_submit_and_wait(u, 1)*/) { std::cout << "FAIL\n"; return 1; From 0dda71219458770ebc612051d9dc181d0121cd45 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Thu, 21 Mar 2024 14:08:16 -0500 Subject: [PATCH 04/30] Added sycl profiling without host task --- .../dynamic_selection_impl/auto_tune_policy.h | 31 +- .../dynamic_selection_impl/sycl_backend.h | 120 ++-- .../dpl/internal/dynamic_selection_traits.h | 25 + .../sycl/test_auto_tune_policy_sycl.pass.cpp | 180 +++--- ...est_auto_tune_policy_sycl_profile.pass.cpp | 538 ------------------ test/support/test_dynamic_selection_utils.h | 1 + 6 files changed, 226 insertions(+), 669 deletions(-) delete mode 100644 test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 011b6dba0c0..215e50fc72d 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -120,21 +120,12 @@ class auto_tune_policy std::unique_lock l(m_); auto index = r.index_; timing_t new_value = t; - /*if (time_by_index_.count(index) == 0) - { - // ignore the 1st timing to cover for JIT compilation - time_by_index_[index] = time_data_t{0, std::numeric_limits::max()}; - std::cout<<"This timing is ignored\n"; - } - else - {*/ - auto& td = time_by_index_[index]; - auto n = td.num_timings_; - new_value = (n * td.value_ + t) / (n + 1); - td.num_timings_ = n + 1; - td.value_ = new_value; - - //} + auto& td = time_by_index_[index]; + auto n = td.num_timings_; + new_value = (n * td.value_ + t) / (n + 1); + td.num_timings_ = n + 1; + td.value_ = new_value; + if (new_value < best_timing_) { best_timing_ = new_value; @@ -153,8 +144,7 @@ class auto_tune_policy public: auto_tune_selection_type(const policy_t& p, resource_with_index_t r, std::shared_ptr t) : policy_(p), resource_(r), tuner_(::std::move(t)) - { - } + {} auto unwrap() @@ -171,7 +161,6 @@ class auto_tune_policy void report(const execution_info::task_time_t&, const typename execution_info::task_time_t::value_type& v) const { - std::cout<<"Autotune report : "<add_new_timing(resource_, v); } }; @@ -224,16 +213,16 @@ class auto_tune_policy auto k = make_task_key(std::forward(f), std::forward(args)...); auto t = state_->tuner_by_key_[k]; auto index = t->get_resource_to_profile(); - //std::cout<<"Device used : "<best_resource_.index_<<"\n"; return selection_type{*this, t->best_resource_, t}; } else { + if constexpr(has_lazy_report::value){ + backend_->lazy_report(); + } auto r = state_->resources_with_index_[index]; - std::cout<<"Trial Device used : "<; private: - class async_waiter + class storage_base{ + public: + virtual void wait() = 0; + virtual void report() = 0; + virtual bool is_complete() = 0; + }; + + std::vector storage_arr; + + template + void addStorage(T *t){ + storage_arr.push_back(t); + } + + template + class async_waiter : public storage_base { sycl::event e_; + Selection* s; + std::optional timing; public: - async_waiter(sycl::event e) : e_(e) {} + async_waiter(sycl::event e) : e_(e){} + async_waiter(sycl::event e, Selection* selection, std::optional t=std::nullopt) : e_(e), s(selection), timing(t) {} sycl::event unwrap() { return e_; } + void - wait() + wait() override { e_.wait(); } + + void + report() override{ + if constexpr (report_value_v){ + if (!timing.has_value()) + { + cl_ulong time_start = e_.template get_profiling_info(); + cl_ulong time_end = e_.template get_profiling_info(); + s->report(execution_info::task_time, time_end-time_start); + }else{ + auto t = timing.value(); + s->report(execution_info::task_time, (std::chrono::steady_clock::now() - t).count()); + } + } + + } + + bool + is_complete() override{ + return e_.get_info() == sycl::info::event_command_status::complete; + } + }; + class submission_group { resource_container_t resources_; @@ -93,7 +135,6 @@ class sycl_backend global_rank_.push_back(e); } sgroup_ptr_ = std::make_unique(global_rank_); - number_of_resources=global_rank_.size(); } template @@ -118,52 +159,32 @@ class sycl_backend } } auto e1 = f(q, std::forward(args)...); - break; - auto e2 = q.submit([=](sycl::handler& h) { - h.depends_on(e1); - h.host_task([=]() { - if constexpr (report_info_v) - { + if constexpr(report_info_v){ + auto e2 = q.submit([=](sycl::handler& h){ + h.depends_on(e1); + h.host_task([=](){ s.report(execution_info::task_completion); - } - if constexpr (report_value_v) - { - if(number_of_resources==0){ - if (use_event_profiling) - { - try{ - cl_ulong time_start = e1.template get_profiling_info(); - //std::cout<<" Time start : "<(); - - //std::cout<<"Time end : "<){ + if (use_event_profiling) + { + auto waiter = async_waiter{e1, new SelectionHandle(s)}; + addStorage(new async_waiter(waiter)); + return waiter; + } + else{ + auto waiter = async_waiter{e1, new SelectionHandle(s), t0}; + addStorage(new async_waiter(waiter)); + return waiter; + } + } } else { - return async_waiter{f(unwrap(s), std::forward(args)...)}; + return async_waiter{f(unwrap(s), std::forward(args)...), new SelectionHandle(s)}; } } @@ -179,6 +200,15 @@ class sycl_backend return global_rank_; } + void lazy_report(){ + int size = storage_arr.size(); + for(auto i = storage_arr.begin(); i!=storage_arr.begin()+size; i++){ + if((*i)->is_complete()){ + (*i)->report(); + storage_arr.erase(i); + } + } + } private: std::atomic number_of_resources;; resource_container_t global_rank_; diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index c1955a86c42..f61a0e29461 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -121,6 +121,18 @@ struct has_report_value : decltype(has_report_value_impl(0)) { }; +template +auto +has_lazy_report_impl(...) -> std::false_type; + +template +auto +has_lazy_report_impl() -> decltype(std::declval().lazy_report(), std::true_type{}); + +template +struct has_lazy_report : decltype(has_lazy_report_impl(0)) +{ +}; } //namespace internal struct deferred_initialization_t @@ -299,6 +311,19 @@ struct report_value template inline constexpr bool report_value_v = report_value::value; +template +auto +has_lazy_report_impl(...) -> std::false_type; + +template +auto +has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true_type{}); + +template +struct has_lazy_report : decltype(has_lazy_report_impl(0)) +{ +}; + } // namespace experimental } // namespace dpl } // namespace oneapi diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp index 85bfd27f931..56d4ebae1b7 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp @@ -108,11 +108,13 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) } } ecount += i; - if (*j == 0) + if (*j == 0) { - return sycl::event{}; - } - else + return q.submit([=](sycl::handler& h){ + h.single_task([](){}); + }); + } + else { return q.submit([=](sycl::handler& h) { h.parallel_for()>>( 1000000, [=](sycl::id<1> idx) { @@ -169,6 +175,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) } }); }); + return x; } }); oneapi::dpl::experimental::wait(s); @@ -250,11 +257,13 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource) } } ecount += i; - if (*j == 0) + if (*j == 0) { - return sycl::event{}; - } - else + return q.submit([=](sycl::handler& h){ + h.single_task([](){}); + }); + } + else { return q.submit([=](sycl::handler& h) { h.parallel_for()>>( 1000000, [=](sycl::id<1> idx) { @@ -452,6 +468,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) } }); }); + return x; } }); } @@ -479,14 +496,20 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) return 0; } + +template static inline void build_auto_tune_universe(std::vector& u) { + auto prop_list = sycl::property_list{}; + if(use_event_profiling){ + prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; + } + try { auto device_cpu1 = sycl::device(sycl::cpu_selector_v); - sycl::queue cpu1_queue(device_cpu1); - run_sycl_sanity_test(cpu1_queue); + sycl::queue cpu1_queue{device_cpu1, prop_list}; u.push_back(cpu1_queue); } catch (const sycl::exception&) @@ -496,8 +519,7 @@ build_auto_tune_universe(std::vector& u) try { auto device_cpu2 = sycl::device(sycl::cpu_selector_v); - sycl::queue cpu2_queue(device_cpu2); - run_sycl_sanity_test(cpu2_queue); + sycl::queue cpu2_queue{device_cpu2, prop_list}; u.push_back(cpu2_queue); } catch (const sycl::exception&) @@ -507,8 +529,7 @@ build_auto_tune_universe(std::vector& u) try { auto device_cpu3 = sycl::device(sycl::cpu_selector_v); - sycl::queue cpu3_queue(device_cpu3); - run_sycl_sanity_test(cpu3_queue); + sycl::queue cpu3_queue{device_cpu3, prop_list}; u.push_back(cpu3_queue); } catch (const sycl::exception&) @@ -518,8 +539,7 @@ build_auto_tune_universe(std::vector& u) try { auto device_cpu4 = sycl::device(sycl::cpu_selector_v); - sycl::queue cpu4_queue(device_cpu4); - run_sycl_sanity_test(cpu4_queue); + sycl::queue cpu4_queue{device_cpu4, prop_list}; u.push_back(cpu4_queue); } catch (const sycl::exception&) @@ -527,6 +547,7 @@ build_auto_tune_universe(std::vector& u) std::cout << "SKIPPED: Unable to run with cpu_selector\n"; } } + #endif int @@ -534,49 +555,78 @@ main() { #if TEST_DYNAMIC_SELECTION_AVAILABLE using policy_t = oneapi::dpl::experimental::auto_tune_policy; - std::vector u; - build_auto_tune_universe(u); + std::vector u1; + std::vector u2; + constexpr bool use_event_profiling = true; + build_auto_tune_universe(u1); + build_auto_tune_universe(u2); //If building the universe is not a success, return - if (u.size() == 0) + if (u1.size() == 0) return 0; - auto f = [u](int i) { + auto f = [u1](int i) { if (i <= 8) - return u[(i - 1) % 4]; + return u1[(i - 1) % 4]; else - return u[0]; + return u1[0]; }; constexpr bool just_call_submit = false; constexpr bool call_select_before_submit = true; - if (test_auto_initialization(u) || test_select(u, f) || - test_auto_submit_wait_on_event(u, 0) || - test_auto_submit_wait_on_event(u, 1) || - test_auto_submit_wait_on_event(u, 2) || - test_auto_submit_wait_on_event(u, 3) || - test_auto_submit_wait_on_group(u, 0) || - test_auto_submit_wait_on_group(u, 1) || - test_auto_submit_wait_on_group(u, 2) || - test_auto_submit_wait_on_group(u, 3) || - test_auto_submit_and_wait(u, 0) || - test_auto_submit_and_wait(u, 1) || - test_auto_submit_and_wait(u, 2) || - test_auto_submit_and_wait(u, 3) + if (test_auto_initialization(u1) || test_select(u1, f) || + test_auto_submit_wait_on_event(u1, 0) || + test_auto_submit_wait_on_event(u1, 1) || + test_auto_submit_wait_on_event(u1, 2) || + test_auto_submit_wait_on_event(u1, 3) || + test_auto_submit_wait_on_group(u1, 0) || + test_auto_submit_wait_on_group(u1, 1) || + test_auto_submit_wait_on_group(u1, 2) || + test_auto_submit_wait_on_group(u1, 3) || + test_auto_submit_and_wait(u1, 0) || + test_auto_submit_and_wait(u1, 1) || + test_auto_submit_and_wait(u1, 2) || + test_auto_submit_and_wait(u1, 3) || + // now select then submits + test_auto_submit_wait_on_event(u1, 0) || + test_auto_submit_wait_on_event(u1, 1) || + test_auto_submit_wait_on_event(u1, 2) || + test_auto_submit_wait_on_event(u1, 3) || + test_auto_submit_wait_on_group(u1, 0) || + test_auto_submit_wait_on_group(u1, 1) || + test_auto_submit_wait_on_group(u1, 2) || + test_auto_submit_wait_on_group(u1, 3) || + test_auto_submit_and_wait(u1, 0) || + test_auto_submit_and_wait(u1, 1) || + test_auto_submit_and_wait(u1, 2) || + test_auto_submit_and_wait(u1, 3) || + //Use event profiling + test_auto_submit_wait_on_event(u2, 0) || + test_auto_submit_wait_on_event(u2, 1) || + test_auto_submit_wait_on_event(u2, 2) || + test_auto_submit_wait_on_event(u2, 3) || + test_auto_submit_wait_on_group(u2, 0) || + test_auto_submit_wait_on_group(u2, 1) || + test_auto_submit_wait_on_group(u2, 2) || + test_auto_submit_wait_on_group(u2, 3) || + test_auto_submit_and_wait(u2, 0) || + test_auto_submit_and_wait(u2, 1) || + test_auto_submit_and_wait(u2, 2) || + test_auto_submit_and_wait(u2, 3) || // now select then submits - || test_auto_submit_wait_on_event(u, 0) || - test_auto_submit_wait_on_event(u, 1) || - test_auto_submit_wait_on_event(u, 2) || - test_auto_submit_wait_on_event(u, 3) || - test_auto_submit_wait_on_group(u, 0) || - test_auto_submit_wait_on_group(u, 1) || - test_auto_submit_wait_on_group(u, 2) || - test_auto_submit_wait_on_group(u, 3) || - test_auto_submit_and_wait(u, 0) || - test_auto_submit_and_wait(u, 1) || - test_auto_submit_and_wait(u, 2) || - test_auto_submit_and_wait(u, 3)) + test_auto_submit_wait_on_event(u2, 0) || + test_auto_submit_wait_on_event(u2, 1) || + test_auto_submit_wait_on_event(u2, 2) || + test_auto_submit_wait_on_event(u2, 3) || + test_auto_submit_wait_on_group(u2, 0) || + test_auto_submit_wait_on_group(u2, 1) || + test_auto_submit_wait_on_group(u2, 2) || + test_auto_submit_wait_on_group(u2, 3) || + test_auto_submit_and_wait(u2, 0) || + test_auto_submit_and_wait(u2, 1) || + test_auto_submit_and_wait(u2, 2) || + test_auto_submit_and_wait(u2, 3)) { std::cout << "FAIL\n"; return 1; diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp deleted file mode 100644 index b8cc1f686f0..00000000000 --- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp +++ /dev/null @@ -1,538 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Copyright (C) 2023 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include "oneapi/dpl/dynamic_selection" -#include "support/test_dynamic_selection_utils.h" -#include "support/test_config.h" -#if TEST_DYNAMIC_SELECTION_AVAILABLE -# include "support/sycl_sanity.h" - -template -int -test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) -{ - using my_policy_t = Policy; - - int j; - std::vector v(1000000, 0.0); - - my_policy_t p{u}; - auto n_samples = u.size(); - - const int N = 10; - std::atomic ecount = 0; - bool pass = true; - - for (int i = 1; i <= N; ++i) - { - if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource) - { - j = 100; - } - else - { - j = 0; - } - const size_t bytes = 1000000 * sizeof(double); - if constexpr (call_select_before_submit) - { - auto f = [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { - if (i <= 2 * n_samples) - { - // we should be round-robining through the resources - if (q != u[(i - 1) % n_samples]) - { - std::cout << i << ": mismatch during rr phase\n" << std::flush; - pass = false; - } - } - else - { - if (q != u[best_resource]) - { - std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; - pass = false; - } - } - ecount += i; - if (j == 0) - { - return q.submit([&](sycl::handler& h) - { - h.single_task([=](){}); - }); - } - else - { - return q.submit([&](sycl::handler& h) { - double *d_v = sycl::malloc_device(1000000, q); - q.memcpy(d_v, v.data(), bytes).wait(); - q.parallel_for( - 1000000, [=](sycl::id<1> idx) { - for (int j0 = 0; j0 < j; ++j0) - { - d_v[idx] += idx; - } - }); - q.memcpy(v.data(), d_v, bytes).wait(); - }); - } - }; - auto s = oneapi::dpl::experimental::select(p, f); - auto e = oneapi::dpl::experimental::submit(s, f); - oneapi::dpl::experimental::wait(e); - } - else - { - // it's ok to capture by reference since we are waiting on each call - auto s = oneapi::dpl::experimental::submit( - p, [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { - if (i <= 2 * n_samples) - { - // we should be round-robining through the resources - if (q != u[(i - 1) % n_samples]) - { - std::cout << i << ": mismatch during rr phase\n" << std::flush; - pass = false; - } - } - else - { - if (q != u[best_resource]) - { - std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; - pass = false; - } - } - ecount += i; - if (j == 0) - { - std::cout<<"Device 0\n"; - return q.submit([&](sycl::handler& h){ - h.single_task([=](){}); - }); - } - else - { - std::cout<<"Device 1\n"; - return q.submit([&](sycl::handler& h) { - double *d_v = sycl::malloc_device(1000000, q); - q.memcpy(d_v, v.data(), bytes).wait(); - q.parallel_for( - 1000000, [=](sycl::id<1> idx) { - for (int j0 = 0; j0 < j; ++j0) - { - d_v[idx] += idx; - } - }); - q.memcpy(v.data(), d_v, bytes).wait(); - }); - } - }); - oneapi::dpl::experimental::wait(s); - } - - int count = ecount.load(); - if (count != i * (i + 1) / 2) - { - std::cout << "ERROR: scheduler did not execute all tasks exactly once\n"; - return 1; - } - } - if (!pass) - { - std::cout << "ERROR: did not select expected resources\n"; - return 1; - } - if constexpr (call_select_before_submit) - { - std::cout << "select then submit and wait on event: OK\n"; - } - else - { - std::cout << "submit and wait on event: OK\n"; - } - return 0; -} - -template -int -test_auto_submit_wait_on_group(UniverseContainer u, int best_resource) -{ - using my_policy_t = Policy; - - int j; - std::vector v(1000000, 0.0); - - my_policy_t p{u}; - auto n_samples = u.size(); - - const int N = 10; - std::atomic ecount = 0; - bool pass = true; - - for (int i = 1; i <= N; ++i) - { - if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource) - { - j = 100; - } - else - { - j = 0; - } - const size_t bytes = 1000000 * sizeof(double); - if constexpr (call_select_before_submit) - { - auto f = [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { - if (i <= 2 * n_samples) - { - // we should be round-robining through the resources - if (q != u[(i - 1) % n_samples]) - { - std::cout << i << ": mismatch during rr phase\n" << std::flush; - pass = false; - } - } - else - { - if (q != u[best_resource]) - { - std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; - pass = false; - } - } - ecount += i; - if (j == 0) - { - return q.submit([=](sycl::handler& h){ - h.single_task([=](){}); - }); - } - else - { - return q.submit([&](sycl::handler& h) { - double *d_v = sycl::malloc_device(1000000, q); - q.memcpy(d_v, v.data(), bytes).wait(); - h.parallel_for( - 1000000, [=](sycl::id<1> idx) { - for (int j0 = 0; j0 < j; ++j0) - { - d_v[idx] += idx; - } - }); - q.memcpy(v.data(), d_v, bytes).wait(); - }); - } - }; - auto s = oneapi::dpl::experimental::select(p, f); - auto e = oneapi::dpl::experimental::submit(s, f); - oneapi::dpl::experimental::wait(p.get_submission_group()); - } - else - { - // it's ok to capture by reference since we are waiting on each call - auto s = oneapi::dpl::experimental::submit( - p, [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { - if (i <= 2 * n_samples) - { - // we should be round-robining through the resources - if (q != u[(i - 1) % n_samples]) - { - std::cout << i << ": mismatch during rr phase\n" << std::flush; - pass = false; - } - } - else - { - if (q != u[best_resource]) - { - std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; - pass = false; - } - } - ecount += i; - if (j == 0) - { - return q.submit([=](sycl::handler& h){ - h.single_task([=](){}); - }); - } - else - { - return q.submit([&](sycl::handler& h) { - double *d_v = sycl::malloc_device(1000000, q); - q.memcpy(d_v, v.data(), bytes).wait(); - h.parallel_for( - 1000000, [=](sycl::id<1> idx) { - for (int j0 = 0; j0 < j; ++j0) - { - d_v[idx] += idx; - } - }); - q.memcpy(v.data(), d_v, bytes).wait(); - }); - } - }); - oneapi::dpl::experimental::wait(p.get_submission_group()); - } - - int count = ecount.load(); - if (count != i * (i + 1) / 2) - { - std::cout << "ERROR: scheduler did not execute all tasks exactly once\n"; - return 1; - } - } - if (!pass) - { - std::cout << "ERROR: did not select expected resources\n"; - return 1; - } - if constexpr (call_select_before_submit) - { - std::cout << "select then submit and wait on group: OK\n"; - } - else - { - std::cout << "submit and wait on group: OK\n"; - } - return 0; -} - -template -int -test_auto_submit_and_wait(UniverseContainer u, int best_resource) -{ - using my_policy_t = Policy; - - // they are cpus so this is ok - int j; - std::vector v(1000000, 0.0); - - my_policy_t p{u}; - auto n_samples = u.size(); - - const int N = 10; - std::atomic ecount = 0; - bool pass = true; - - for (int i = 1; i <= N; ++i) - { - if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource) - { - j = 100; - } - else - { - j = 0; - } - const size_t bytes = 1000000 * sizeof(double); - if constexpr (call_select_before_submit) - { - auto f = [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { - if (i <= 2 * n_samples) - { - // we should be round-robining through the resources - if (q != u[(i - 1) % n_samples]) - { - std::cout << i << ": mismatch during rr phase\n" << std::flush; - pass = false; - } - } - else - { - if (q != u[best_resource]) - { - std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; - pass = false; - } - } - ecount += i; - if (j == 0) - { - return q.submit([=](sycl::handler& h){ - h.single_task([=](){}); - }); - } - else - { - return q.submit([&](sycl::handler& h) { - double *d_v = sycl::malloc_device(1000000, q); - q.memcpy(d_v, v.data(), bytes).wait(); - h.parallel_for( - 1000000, [=](sycl::id<1> idx) { - for (int j0 = 0; j0 < j; ++j0) - { - d_v[idx] += idx; - } - }); - q.memcpy(v.data(), d_v, bytes).wait(); - }); - } - }; - auto s = oneapi::dpl::experimental::select(p, f); - oneapi::dpl::experimental::submit_and_wait(s, f); - } - else - { - // it's ok to capture by reference since we are waiting on each call - oneapi::dpl::experimental::submit_and_wait( - p, [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { - if (i <= 2 * n_samples) - { - // we should be round-robining through the resources - if (q != u[(i - 1) % n_samples]) - { - std::cout << i << ": mismatch during rr phase\n" << std::flush; - pass = false; - } - } - else - { - if (q != u[best_resource]) - { - std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush; - pass = false; - } - } - ecount += i; - if (j == 0) - { - return q.submit([=](sycl::handler& h){ - h.single_task([=](){}); - }); - } - else - { - return q.submit([&](sycl::handler& h) { - double *d_v = sycl::malloc_device(1000000, q); - q.memcpy(d_v, v.data(), bytes).wait(); - h.parallel_for( - 1000000, [=](sycl::id<1> idx) { - for (int j0 = 0; j0 < j; ++j0) - { - d_v[idx] += idx; - } - }); - q.memcpy(v.data(), d_v, bytes).wait(); - }); - } - }); - } - - int count = ecount.load(); - if (count != i * (i + 1) / 2) - { - std::cout << "ERROR: scheduler did not execute all tasks exactly once\n"; - return 1; - } - } - if (!pass) - { - std::cout << "ERROR: did not select expected resources\n"; - return 1; - } - if constexpr (call_select_before_submit) - { - std::cout << "select then submit_and_wait: OK\n"; - } - else - { - std::cout << "submit_and_wait: OK\n"; - } - return 0; -} - -static inline void -build_auto_tune_universe(std::vector& u) -{ - auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; - try - { - auto device_cpu = sycl::device(sycl::cpu_selector_v); - sycl::queue cpu_queue{device_cpu, prop_list}; - run_sycl_sanity_test(cpu_queue); - u.push_back(cpu_queue); - } - catch (const sycl::exception&) - { - std::cout << "SKIPPED: Unable to run with cpu_selector\n"; - } - try - { - auto device_gpu = sycl::device(sycl::gpu_selector_v); - sycl::queue gpu_queue{device_gpu, prop_list}; - run_sycl_sanity_test(gpu_queue); - u.push_back(gpu_queue); - } - catch (const sycl::exception&) - { - std::cout << "SKIPPED: Unable to run with gpu_selector\n"; - } -} - -#endif - -int -main() -{ -#if TEST_DYNAMIC_SELECTION_AVAILABLE - using policy_t = oneapi::dpl::experimental::auto_tune_policy; - std::vector u; - build_auto_tune_universe(u); - - //If building the universe is not a success, return - if (u.size() == 0 || u.size()==0) - return 0; - - constexpr bool just_call_submit = false; - constexpr bool call_select_before_submit = true; - - if (test_auto_submit_wait_on_event(u, 0) /*|| - test_auto_submit_wait_on_event(u, 1) || - test_auto_submit_wait_on_event(u, 0) || - test_auto_submit_wait_on_event(u, 1) || - test_auto_submit_wait_on_group(u, 0) || - test_auto_submit_wait_on_group(u, 1) || - test_auto_submit_wait_on_group(u, 0) || - test_auto_submit_wait_on_group(u, 1) || - test_auto_submit_and_wait(u, 0) || - test_auto_submit_and_wait(u, 1)|| - test_auto_submit_and_wait(u, 0) || - test_auto_submit_and_wait(u, 1) || - // now select then submits - test_auto_submit_wait_on_event(u, 0) || - test_auto_submit_wait_on_event(u, 1) || - test_auto_submit_wait_on_event(u, 0) || - test_auto_submit_wait_on_event(u, 1) || - test_auto_submit_wait_on_group(u, 0) || - test_auto_submit_wait_on_group(u, 1) || - test_auto_submit_wait_on_group(u, 0) || - test_auto_submit_wait_on_group(u, 1) || - test_auto_submit_and_wait(u, 0) || - test_auto_submit_and_wait(u, 1) || - test_auto_submit_and_wait(u, 0) || - test_auto_submit_and_wait(u, 1)*/) - { - std::cout << "FAIL\n"; - return 1; - } - else - { - std::cout << "PASS\n"; - return 0; - } -#else - std::cout << "SKIPPED\n"; - return 0; -#endif // TEST_DYNAMIC_SELECTION_AVAILABLE -} diff --git a/test/support/test_dynamic_selection_utils.h b/test/support/test_dynamic_selection_utils.h index dded1a1362f..3e35cd2242b 100644 --- a/test/support/test_dynamic_selection_utils.h +++ b/test/support/test_dynamic_selection_utils.h @@ -97,6 +97,7 @@ test_select(UniverseContainer u, ResourceFunction&& f) return 1; } } + if (!pass) { std::cout << "ERROR: did not select expected resources\n"; From a38327a1e5eb28b3e3204b7bd69a93d6e4e2482e Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 22 Mar 2024 10:42:22 -0500 Subject: [PATCH 05/30] Corrections in dynamic traits --- .../dynamic_selection_impl/auto_tune_policy.h | 6 ++--- .../dynamic_selection_impl/sycl_backend.h | 23 ++++++++----------- .../dpl/internal/dynamic_selection_traits.h | 12 ++++++++-- .../sycl/test_auto_tune_policy_sycl.pass.cpp | 15 ++++-------- test/support/test_dynamic_selection_utils.h | 1 - 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 215e50fc72d..eba8f7ec8ae 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -125,7 +125,6 @@ class auto_tune_policy new_value = (n * td.value_ + t) / (n + 1); td.num_timings_ = n + 1; td.value_ = new_value; - if (new_value < best_timing_) { best_timing_ = new_value; @@ -144,7 +143,8 @@ class auto_tune_policy public: auto_tune_selection_type(const policy_t& p, resource_with_index_t r, std::shared_ptr t) : policy_(p), resource_(r), tuner_(::std::move(t)) - {} + { + } auto unwrap() @@ -219,7 +219,7 @@ class auto_tune_policy } else { - if constexpr(has_lazy_report::value){ + if constexpr(lazy_report_v){ backend_->lazy_report(); } auto r = state_->resources_with_index_[index]; diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 279696cddc3..74b1bf7e3ba 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -19,7 +19,6 @@ #include #include #include -#include namespace oneapi { @@ -37,22 +36,22 @@ class sycl_backend using resource_container_t = std::vector; private: - class storage_base{ + class async_waiter_base{ public: virtual void wait() = 0; virtual void report() = 0; virtual bool is_complete() = 0; }; - std::vector storage_arr; + std::vector async_waiter_arr; template - void addStorage(T *t){ - storage_arr.push_back(t); + void add_waiter(T *t){ + async_waiter_arr.push_back(t); } template - class async_waiter : public storage_base + class async_waiter : public async_waiter_base { sycl::event e_; Selection* s; @@ -123,7 +122,6 @@ class sycl_backend { initialize_default_resources(); sgroup_ptr_ = std::make_unique(global_rank_); - number_of_resources=global_rank_.size(); } template @@ -172,12 +170,12 @@ class sycl_backend if (use_event_profiling) { auto waiter = async_waiter{e1, new SelectionHandle(s)}; - addStorage(new async_waiter(waiter)); + add_waiter(new async_waiter(waiter)); return waiter; } else{ auto waiter = async_waiter{e1, new SelectionHandle(s), t0}; - addStorage(new async_waiter(waiter)); + add_waiter(new async_waiter(waiter)); return waiter; } } @@ -201,16 +199,15 @@ class sycl_backend } void lazy_report(){ - int size = storage_arr.size(); - for(auto i = storage_arr.begin(); i!=storage_arr.begin()+size; i++){ + int size = async_waiter_arr.size(); + for(auto i = async_waiter_arr.begin(); i!=async_waiter_arr.begin()+size; i++){ if((*i)->is_complete()){ (*i)->report(); - storage_arr.erase(i); + async_waiter_arr.erase(i); } } } private: - std::atomic number_of_resources;; resource_container_t global_rank_; std::unique_ptr sgroup_ptr_; diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index f61a0e29461..0a63861bb38 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -127,7 +127,7 @@ has_lazy_report_impl(...) -> std::false_type; template auto -has_lazy_report_impl() -> decltype(std::declval().lazy_report(), std::true_type{}); +has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true_type{}); template struct has_lazy_report : decltype(has_lazy_report_impl(0)) @@ -311,7 +311,7 @@ struct report_value template inline constexpr bool report_value_v = report_value::value; -template +/*template auto has_lazy_report_impl(...) -> std::false_type; @@ -322,7 +322,15 @@ has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true template struct has_lazy_report : decltype(has_lazy_report_impl(0)) { +};*/ + +template +struct lazy_report_value +{ + static constexpr bool value = internal::has_lazy_report::value; }; +template +inline constexpr bool lazy_report_v = lazy_report_value::value; } // namespace experimental } // namespace dpl diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp index 56d4ebae1b7..3f8a0ca606e 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp @@ -157,15 +157,13 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) ecount += i; if (*j == 0) { - auto x = q.submit([=](sycl::handler& h){ + return q.submit([=](sycl::handler& h){ h.single_task([](){}); }); - return x; - } else { - auto x = q.submit([=](sycl::handler& h) { + return q.submit([=](sycl::handler& h) { h.parallel_for()>>( 1000000, [=](sycl::id<1> idx) { @@ -175,7 +173,6 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) } }); }); - return x; } }); oneapi::dpl::experimental::wait(s); @@ -451,14 +448,13 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) ecount += i; if (*j == 0) { - auto x = q.submit([=](sycl::handler& h){ + return q.submit([=](sycl::handler& h){ h.single_task([](){}); }); - return x; } else { - auto x = q.submit([=](sycl::handler& h) { + return q.submit([=](sycl::handler& h) { h.parallel_for()>>( 1000000, [=](sycl::id<1> idx) { @@ -468,7 +464,6 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) } }); }); - return x; } }); } @@ -562,7 +557,7 @@ main() build_auto_tune_universe(u2); //If building the universe is not a success, return - if (u1.size() == 0) + if (u1.size() == 0 || u2.size()==0) return 0; auto f = [u1](int i) { diff --git a/test/support/test_dynamic_selection_utils.h b/test/support/test_dynamic_selection_utils.h index 3e35cd2242b..dded1a1362f 100644 --- a/test/support/test_dynamic_selection_utils.h +++ b/test/support/test_dynamic_selection_utils.h @@ -97,7 +97,6 @@ test_select(UniverseContainer u, ResourceFunction&& f) return 1; } } - if (!pass) { std::cout << "ERROR: did not select expected resources\n"; From 70f993cbd03109d10b30cb6315b99e9ba56f4fd8 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 22 Mar 2024 10:59:15 -0500 Subject: [PATCH 06/30] Remove comments in dynamic_selection_traits --- .../oneapi/dpl/internal/dynamic_selection_traits.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index 0a63861bb38..b5b220cd1e5 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -311,19 +311,6 @@ struct report_value template inline constexpr bool report_value_v = report_value::value; -/*template -auto -has_lazy_report_impl(...) -> std::false_type; - -template -auto -has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true_type{}); - -template -struct has_lazy_report : decltype(has_lazy_report_impl(0)) -{ -};*/ - template struct lazy_report_value { From 69f5014511f88b97564e405ca112f87d180e8d56 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 22 Mar 2024 15:50:27 -0500 Subject: [PATCH 07/30] Adding back jit compolation restrictions --- .../dynamic_selection_impl/auto_tune_policy.h | 18 +++++++++---- .../dynamic_selection_impl/sycl_backend.h | 26 ++++++++----------- .../sycl/test_auto_tune_policy_sycl.pass.cpp | 1 - 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 3bcb0c52772..5fddd9b8467 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -122,11 +122,19 @@ class auto_tune_policy std::lock_guard l(m_); auto index = r.index_; timing_t new_value = t; - auto& td = time_by_index_[index]; - auto n = td.num_timings_; - new_value = (n * td.value_ + t) / (n + 1); - td.num_timings_ = n + 1; - td.value_ = new_value; + if (time_by_index_.count(index) == 0) + { + // ignore the 1st timing to cover for JIT compilation + time_by_index_[index] = time_data_t{0, std::numeric_limits::max()}; + } + else + { + auto& td = time_by_index_[index]; + auto n = td.num_timings_; + new_value = (n * td.value_ + t) / (n + 1); + td.num_timings_ = n + 1; + td.value_ = new_value; + } if (new_value < best_timing_) { best_timing_ = new_value; diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index b39b70e683a..a7e00f6e8cd 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -55,11 +55,9 @@ class sycl_backend { sycl::event e_; Selection* s; - std::optional timing; - public: async_waiter(sycl::event e) : e_(e){} - async_waiter(sycl::event e, Selection* selection, std::optional t=std::nullopt) : e_(e), s(selection), timing(t) {} + async_waiter(sycl::event e, Selection* selection) : e_(e), s(selection) {} sycl::event unwrap() @@ -76,15 +74,9 @@ class sycl_backend void report() override{ if constexpr (report_value_v){ - if (!timing.has_value()) - { - cl_ulong time_start = e_.template get_profiling_info(); - cl_ulong time_end = e_.template get_profiling_info(); - s->report(execution_info::task_time, time_end-time_start); - }else{ - auto t = timing.value(); - s->report(execution_info::task_time, (std::chrono::steady_clock::now() - t).count()); - } + cl_ulong time_start = e_.template get_profiling_info(); + cl_ulong time_end = e_.template get_profiling_info(); + s->report(execution_info::task_time, time_end-time_start); } } @@ -175,9 +167,13 @@ class sycl_backend return waiter; } else{ - auto waiter = async_waiter{e1, new SelectionHandle(s), t0}; - add_waiter(new async_waiter(waiter)); - return waiter; + auto e2 = q.submit([=](sycl::handler& h){ + h.depends_on(e1); + h.host_task([=](){ + s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count()); + }); + }); + return async_waiter{e2, new SelectionHandle(s)}; } } } diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp index bdb9bf74549..23702af4c02 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp @@ -513,7 +513,6 @@ main() bool bProcessed = false; #if TEST_DYNAMIC_SELECTION_AVAILABLE - std::cout<<"HERE\n"; using policy_t = oneapi::dpl::experimental::auto_tune_policy; std::vector u1; std::vector u2; From 757c4701cc62f05f81667899cb07e227a0cacc3d Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Wed, 27 Mar 2024 11:18:17 -0500 Subject: [PATCH 08/30] Adressing comments to add thread safety, better traits --- .../dynamic_selection_impl/auto_tune_policy.h | 6 +-- .../dynamic_selection_impl/sycl_backend.h | 40 ++++++++++++------- .../dpl/internal/dynamic_selection_traits.h | 18 +++++---- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 5fddd9b8467..8557f15fa3d 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -217,6 +217,9 @@ class auto_tune_policy select(Function&& f, Args&&... args) { static_assert(sizeof...(KeyArgs) == sizeof...(Args)); + if constexpr(backend_traits::lazy_report_v){ + backend_->lazy_report(); + } if (state_) { std::lock_guard l(state_->m_); @@ -229,9 +232,6 @@ class auto_tune_policy } else { - if constexpr(lazy_report_v){ - backend_->lazy_report(); - } auto r = state_->resources_with_index_[index]; return selection_type{*this, r, t}; } diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index a7e00f6e8cd..11a475ccbc6 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -43,12 +43,29 @@ class sycl_backend virtual bool is_complete() = 0; }; - std::vector async_waiter_arr; + struct async_waiter_arr{ - template - void add_waiter(T *t){ - async_waiter_arr.push_back(t); - } + std::mutex m_; + std::vector async_waiters; + + template + void add_waiter(T *t){ + std::lock_guard l(m_); + async_waiters.push_back(t); + } + + void lazy_report(){ + int size = async_waiters.size(); + for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){ + if((*i)->is_complete()){ + (*i)->report(); + async_waiters.erase(i); + } + } + } + }; + + async_waiter_arr async_waiter_arr; template class async_waiter : public async_waiter_base @@ -159,11 +176,11 @@ class sycl_backend }); return async_waiter{e2, new SelectionHandle(s)}; } - if constexpr(report_value_v){ + else if constexpr(report_value_v){ if (use_event_profiling) { auto waiter = async_waiter{e1, new SelectionHandle(s)}; - add_waiter(new async_waiter(waiter)); + async_waiter_arr.add_waiter(new async_waiter(waiter)); return waiter; } else{ @@ -196,14 +213,9 @@ class sycl_backend } void lazy_report(){ - int size = async_waiter_arr.size(); - for(auto i = async_waiter_arr.begin(); i!=async_waiter_arr.begin()+size; i++){ - if((*i)->is_complete()){ - (*i)->report(); - async_waiter_arr.erase(i); - } - } + async_waiter_arr.lazy_report(); } + private: resource_container_t global_rank_; std::unique_ptr sgroup_ptr_; diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index b5b220cd1e5..4d6117f9ef1 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -135,6 +135,16 @@ struct has_lazy_report : decltype(has_lazy_report_impl(0)) }; } //namespace internal +namespace backend_traits { + template + struct lazy_report_value + { + static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report::value; + }; + template + inline constexpr bool lazy_report_v = lazy_report_value::value; +} //namespace backend_traits + struct deferred_initialization_t { }; @@ -311,14 +321,6 @@ struct report_value template inline constexpr bool report_value_v = report_value::value; -template -struct lazy_report_value -{ - static constexpr bool value = internal::has_lazy_report::value; -}; -template -inline constexpr bool lazy_report_v = lazy_report_value::value; - } // namespace experimental } // namespace dpl } // namespace oneapi From 37a895a3c79d14dc8c6daf9890ea5fd0ef596258 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Wed, 27 Mar 2024 15:12:38 -0500 Subject: [PATCH 09/30] Added backend trait to check if profiling is enabled --- .../dynamic_selection_impl/auto_tune_policy.h | 4 +++- .../dynamic_selection_impl/sycl_backend.h | 4 ++++ .../dpl/internal/dynamic_selection_traits.h | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 8557f15fa3d..89e118feb0e 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -217,8 +217,10 @@ class auto_tune_policy select(Function&& f, Args&&... args) { static_assert(sizeof...(KeyArgs) == sizeof...(Args)); - if constexpr(backend_traits::lazy_report_v){ + if constexpr(backend_traits::lazy_report_v && backend_traits::enable_profiling_v){ + if(backend_->has_enable_profiling == true){ backend_->lazy_report(); + } } if (state_) { diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 11a475ccbc6..77f1d013d31 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -35,6 +35,7 @@ class sycl_backend using execution_resource_t = resource_type; using resource_container_t = std::vector; + bool has_enable_profiling = false; private: class async_waiter_base{ public: @@ -141,6 +142,9 @@ class sycl_backend for (auto e : v) { global_rank_.push_back(e); + if(e.template has_property()){ + has_enable_profiling = true; + } } sgroup_ptr_ = std::make_unique(global_rank_); } diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index 4d6117f9ef1..1274c1df301 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -133,6 +133,13 @@ template struct has_lazy_report : decltype(has_lazy_report_impl(0)) { }; + +template +struct has_enable_profiling : std::false_type { }; + +// Specialization for U = int +template +struct has_enable_profiling : std::true_type { }; } //namespace internal namespace backend_traits { @@ -143,6 +150,14 @@ namespace backend_traits { }; template inline constexpr bool lazy_report_v = lazy_report_value::value; + + template + struct enable_profiling_value + { + static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling::value; + }; + template + inline constexpr bool enable_profiling_v = enable_profiling_value::value; } //namespace backend_traits struct deferred_initialization_t From d9fe635f664f790d8a42a6c24325f68120da7ae7 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Thu, 4 Apr 2024 11:43:16 -0500 Subject: [PATCH 10/30] Backend traits, renaming and adding a lock to lazy report --- .../dynamic_selection_impl/auto_tune_policy.h | 1 + .../dynamic_selection_impl/backend_traits.h | 66 +++++++++++++++++++ .../dynamic_load_policy.h | 5 ++ .../dynamic_selection_impl/sycl_backend.h | 53 +++++++-------- .../dpl/internal/dynamic_selection_traits.h | 37 ----------- 5 files changed, 99 insertions(+), 63 deletions(-) create mode 100644 include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 89e118feb0e..f551ecb77dc 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -22,6 +22,7 @@ #include #include #include "oneapi/dpl/internal/dynamic_selection_traits.h" +#include "oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h" #if _DS_BACKEND_SYCL != 0 # include "oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h" #endif diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h new file mode 100644 index 00000000000..91181ea2dd8 --- /dev/null +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h @@ -0,0 +1,66 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_INTERNAL_BACKEND_TRAITS_H +#define _ONEDPL_INTERNAL_BACKEND_TRAITS_H + +#include "oneapi/dpl/internal/dynamic_selection_traits.h" + +namespace oneapi +{ +namespace dpl +{ +namespace experimental +{ +namespace internal +{ + template + auto + has_lazy_report_impl(...) -> std::false_type; + + template + auto + has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true_type{}); + + template + struct has_lazy_report : decltype(has_lazy_report_impl(0)) + { + }; + + template + struct has_enable_profiling : std::false_type { }; + + // Specialization for U = int + template + struct has_enable_profiling : std::true_type { }; +}//namespace internal + +namespace backend_traits { + template + struct lazy_report_value + { + static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report::value; + }; + template + inline constexpr bool lazy_report_v = lazy_report_value::value; + + template + struct enable_profiling_value + { + static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling::value; + }; + template + inline constexpr bool enable_profiling_v = enable_profiling_value::value; +} //namespace backend_traits + +} // namespace experimental +} // namespace dpl +} // namespace oneapi + +#endif /*_ONEDPL_INTERNAL_BACKEND_TRAITS_H*/ diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h index d865b9b7fba..0886facb02b 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h @@ -150,6 +150,11 @@ struct dynamic_load_policy selection_type select(Args&&...) { + if constexpr(backend_traits::lazy_report_v && backend_traits::enable_profiling_v){ + if(backend_->has_enable_profiling == true){ + backend_->lazy_report(); + } + } if (state_) { std::lock_guard l(state_->m_); diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 77f1d013d31..88aa63ea231 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -44,30 +44,6 @@ class sycl_backend virtual bool is_complete() = 0; }; - struct async_waiter_arr{ - - std::mutex m_; - std::vector async_waiters; - - template - void add_waiter(T *t){ - std::lock_guard l(m_); - async_waiters.push_back(t); - } - - void lazy_report(){ - int size = async_waiters.size(); - for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){ - if((*i)->is_complete()){ - (*i)->report(); - async_waiters.erase(i); - } - } - } - }; - - async_waiter_arr async_waiter_arr; - template class async_waiter : public async_waiter_base { @@ -106,6 +82,31 @@ class sycl_backend }; + struct async_waiter_list_t{ + + std::mutex m_; + std::vector async_waiters; + + template + void add_waiter(T *t){ + std::lock_guard l(m_); + async_waiters.push_back(t); + } + + void lazy_report(){ + std::lock_guard l(m_); + int size = async_waiters.size(); + for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){ + if((*i)->is_complete()){ + (*i)->report(); + async_waiters.erase(i); + } + } + } + }; + + async_waiter_list_t async_waiter_list; + class submission_group { @@ -184,7 +185,7 @@ class sycl_backend if (use_event_profiling) { auto waiter = async_waiter{e1, new SelectionHandle(s)}; - async_waiter_arr.add_waiter(new async_waiter(waiter)); + async_waiter_list.add_waiter(new async_waiter(waiter)); return waiter; } else{ @@ -217,7 +218,7 @@ class sycl_backend } void lazy_report(){ - async_waiter_arr.lazy_report(); + async_waiter_list.lazy_report(); } private: diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index 1274c1df301..c1955a86c42 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -121,45 +121,8 @@ struct has_report_value : decltype(has_report_value_impl(0)) { }; -template -auto -has_lazy_report_impl(...) -> std::false_type; - -template -auto -has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true_type{}); - -template -struct has_lazy_report : decltype(has_lazy_report_impl(0)) -{ -}; - -template -struct has_enable_profiling : std::false_type { }; - -// Specialization for U = int -template -struct has_enable_profiling : std::true_type { }; } //namespace internal -namespace backend_traits { - template - struct lazy_report_value - { - static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report::value; - }; - template - inline constexpr bool lazy_report_v = lazy_report_value::value; - - template - struct enable_profiling_value - { - static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling::value; - }; - template - inline constexpr bool enable_profiling_v = enable_profiling_value::value; -} //namespace backend_traits - struct deferred_initialization_t { }; From 95ee3acb42e816055601c3b7ff5d68f5e07e2b15 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Tue, 9 Apr 2024 16:26:51 -0500 Subject: [PATCH 11/30] Fixed memory leaks --- .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 88aa63ea231..66e5b77ac59 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -42,6 +42,7 @@ class sycl_backend virtual void wait() = 0; virtual void report() = 0; virtual bool is_complete() = 0; + virtual ~async_waiter_base(){} }; template @@ -85,20 +86,20 @@ class sycl_backend struct async_waiter_list_t{ std::mutex m_; - std::vector async_waiters; + std::vector> async_waiters; template void add_waiter(T *t){ std::lock_guard l(m_); - async_waiters.push_back(t); + async_waiters.push_back(std::unique_ptr(t)); } void lazy_report(){ std::lock_guard l(m_); int size = async_waiters.size(); for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){ - if((*i)->is_complete()){ - (*i)->report(); + if(i->get()->is_complete()){ + i->get()->report(); async_waiters.erase(i); } } From cd5b3182e59504f270604113d1e876e6fd35e797 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Mon, 15 Apr 2024 10:35:58 -0500 Subject: [PATCH 12/30] No nullptr for selection handle in async waiter constructor --- .../dynamic_selection_impl/sycl_backend.h | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 66e5b77ac59..c88238459df 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -42,18 +42,24 @@ class sycl_backend virtual void wait() = 0; virtual void report() = 0; virtual bool is_complete() = 0; - virtual ~async_waiter_base(){} + virtual ~async_waiter_base() = default; }; template class async_waiter : public async_waiter_base { sycl::event e_; - Selection* s; + std::shared_ptr s; public: - async_waiter(sycl::event e) : e_(e){} - async_waiter(sycl::event e, Selection* selection) : e_(e), s(selection) {} - + async_waiter(sycl::event e) : e_(e) {} + async_waiter(sycl::event e, std::shared_ptr selection) : e_(e), s(selection) {} + + async_waiter(async_waiter &w) : e_(w.e_), s(w.s) {} + async_waiter& operator=(async_waiter &w){ + s = w.s; + e_ = w.e_; + return *this; + } sycl::event unwrap() { @@ -86,7 +92,7 @@ class sycl_backend struct async_waiter_list_t{ std::mutex m_; - std::vector> async_waiters; + std::vector> async_waiters; template void add_waiter(T *t){ @@ -180,12 +186,12 @@ class sycl_backend s.report(execution_info::task_completion); }); }); - return async_waiter{e2, new SelectionHandle(s)}; + return async_waiter{e2, std::make_shared(s)}; } else if constexpr(report_value_v){ if (use_event_profiling) { - auto waiter = async_waiter{e1, new SelectionHandle(s)}; + auto waiter = async_waiter{e1,std::make_shared(s)}; async_waiter_list.add_waiter(new async_waiter(waiter)); return waiter; } @@ -196,13 +202,13 @@ class sycl_backend s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count()); }); }); - return async_waiter{e2, new SelectionHandle(s)}; + return async_waiter{e2, std::make_shared(s)}; } } } else { - return async_waiter{f(unwrap(s), std::forward(args)...), new SelectionHandle(s)}; + return async_waiter{f(unwrap(s), std::forward(args)...), std::make_shared(s)}; } } From e58f2640f09a9a3411ab6cf33a0b6926bcd268a0 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Mon, 15 Apr 2024 11:44:00 -0500 Subject: [PATCH 13/30] Added an erase and remove_if --- .../dynamic_selection_impl/sycl_backend.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index c88238459df..030ce47bfdc 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace oneapi { @@ -102,13 +103,13 @@ class sycl_backend void lazy_report(){ std::lock_guard l(m_); - int size = async_waiters.size(); - for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){ - if(i->get()->is_complete()){ - i->get()->report(); - async_waiters.erase(i); - } - } + async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::shared_ptr& async_waiter){ + if(async_waiter->is_complete()){ + async_waiter->report(); + return true; + } + return false; + }), async_waiters.end()); } }; From e134770e1db5e191e8e3dea016a65a72c382397f Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Wed, 24 Apr 2024 16:10:31 -0500 Subject: [PATCH 14/30] Adressing comments for profiling report, Adding profiling to default sycl backend constructor --- .../dynamic_selection_impl/auto_tune_policy.h | 18 ++--- .../dynamic_selection_impl/backend_traits.h | 17 +---- .../dynamic_load_policy.h | 6 +- .../dynamic_selection_impl/sycl_backend.h | 68 ++++++++++++------- 4 files changed, 58 insertions(+), 51 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index f551ecb77dc..cc74f845075 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ class auto_tune_policy using size_type = typename std::vector::size_type; using timing_t = uint64_t; + using report_clock_type = std::chrono::steady_clock; + using report_duration = std::chrono::duration; + static constexpr timing_t never_resample = 0; static constexpr size_type use_best_resource = ~size_type(0); @@ -67,7 +71,7 @@ class auto_tune_policy { std::mutex m_; - std::chrono::steady_clock::time_point t0_; + report_clock_type::time_point t0_; timing_t best_timing_ = std::numeric_limits::max(); resource_with_index_t best_resource_; @@ -81,7 +85,7 @@ class auto_tune_policy timing_t resample_time_ = 0.0; tuner_t(resource_with_index_t br, size_type resources_size, timing_t rt) - : t0_(std::chrono::steady_clock::now()), best_resource_(br), max_resource_to_profile_(resources_size), + : t0_(report_clock_type::now()), best_resource_(br), max_resource_to_profile_(resources_size), resample_time_(rt) { } @@ -101,8 +105,8 @@ class auto_tune_policy } else { - auto now = std::chrono::steady_clock::now(); - auto ms = std::chrono::duration_cast(now - t0_).count(); + auto now = report_clock_type::now(); + auto ms = report_duration(now - t0_).count(); if (ms < resample_time_) { return use_best_resource; @@ -218,10 +222,8 @@ class auto_tune_policy select(Function&& f, Args&&... args) { static_assert(sizeof...(KeyArgs) == sizeof...(Args)); - if constexpr(backend_traits::lazy_report_v && backend_traits::enable_profiling_v){ - if(backend_->has_enable_profiling == true){ - backend_->lazy_report(); - } + if constexpr(backend_traits::lazy_report_v){ + backend_->lazy_report(); } if (state_) { diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h index 91181ea2dd8..b50a5b3a27c 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h @@ -10,7 +10,9 @@ #ifndef _ONEDPL_INTERNAL_BACKEND_TRAITS_H #define _ONEDPL_INTERNAL_BACKEND_TRAITS_H -#include "oneapi/dpl/internal/dynamic_selection_traits.h" +#include +#include +#include namespace oneapi { @@ -33,12 +35,6 @@ namespace internal { }; - template - struct has_enable_profiling : std::false_type { }; - - // Specialization for U = int - template - struct has_enable_profiling : std::true_type { }; }//namespace internal namespace backend_traits { @@ -50,13 +46,6 @@ namespace backend_traits { template inline constexpr bool lazy_report_v = lazy_report_value::value; - template - struct enable_profiling_value - { - static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling::value; - }; - template - inline constexpr bool enable_profiling_v = enable_profiling_value::value; } //namespace backend_traits } // namespace experimental diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h index 0886facb02b..9a7e60cd6fb 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h @@ -150,10 +150,8 @@ struct dynamic_load_policy selection_type select(Args&&...) { - if constexpr(backend_traits::lazy_report_v && backend_traits::enable_profiling_v){ - if(backend_->has_enable_profiling == true){ - backend_->lazy_report(); - } + if constexpr(backend_traits::lazy_report_v){ + backend_->lazy_report(); } if (state_) { diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 030ce47bfdc..d9750e23159 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -16,6 +16,7 @@ #include "oneapi/dpl/internal/dynamic_selection_impl/scoring_policy_defs.h" #include +#include #include #include #include @@ -36,7 +37,10 @@ class sycl_backend using execution_resource_t = resource_type; using resource_container_t = std::vector; - bool has_enable_profiling = false; + using report_clock_type = std::chrono::steady_clock; + using report_duration = std::chrono::duration; + + static inline bool is_profiling_enabled = false; private: class async_waiter_base{ public: @@ -56,11 +60,6 @@ class sycl_backend async_waiter(sycl::event e, std::shared_ptr selection) : e_(e), s(selection) {} async_waiter(async_waiter &w) : e_(w.e_), s(w.s) {} - async_waiter& operator=(async_waiter &w){ - s = w.s; - e_ = w.e_; - return *this; - } sycl::event unwrap() { @@ -78,9 +77,12 @@ class sycl_backend if constexpr (report_value_v){ cl_ulong time_start = e_.template get_profiling_info(); cl_ulong time_end = e_.template get_profiling_info(); - s->report(execution_info::task_time, time_end-time_start); + if(s!=nullptr){ + s->report(execution_info::task_time, report_duration(time_end-time_start).count()); + } } + } bool @@ -93,7 +95,7 @@ class sycl_backend struct async_waiter_list_t{ std::mutex m_; - std::vector> async_waiters; + std::vector> async_waiters; template void add_waiter(T *t){ @@ -102,14 +104,16 @@ class sycl_backend } void lazy_report(){ - std::lock_guard l(m_); - async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::shared_ptr& async_waiter){ - if(async_waiter->is_complete()){ - async_waiter->report(); - return true; - } - return false; - }), async_waiters.end()); + if(is_profiling_enabled){ + std::lock_guard l(m_); + async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::unique_ptr& async_waiter){ + if(async_waiter->is_complete()){ + async_waiter->report(); + return true; + } + return false; + }), async_waiters.end()); + } } }; @@ -147,14 +151,16 @@ class sycl_backend template sycl_backend(const NativeUniverseVector& v) { + bool profiling = true; global_rank_.reserve(v.size()); for (auto e : v) { global_rank_.push_back(e); - if(e.template has_property()){ - has_enable_profiling = true; + if(!e.template has_property()){ + profiling = false; } } + is_profiling_enabled = profiling; sgroup_ptr_ = std::make_unique(global_rank_); } @@ -170,13 +176,12 @@ class sycl_backend if constexpr (report_info_v || report_value_v) { - std::chrono::steady_clock::time_point t0; - bool use_event_profiling = q.template has_property(); + report_clock_type::time_point t0; if constexpr (report_value_v) { - if (!use_event_profiling) + if (!is_profiling_enabled) { - t0 = std::chrono::steady_clock::now(); + t0 = report_clock_type::now(); } } auto e1 = f(q, std::forward(args)...); @@ -189,8 +194,8 @@ class sycl_backend }); return async_waiter{e2, std::make_shared(s)}; } - else if constexpr(report_value_v){ - if (use_event_profiling) + if constexpr(report_value_v){ + if (is_profiling_enabled) { auto waiter = async_waiter{e1,std::make_shared(s)}; async_waiter_list.add_waiter(new async_waiter(waiter)); @@ -200,7 +205,7 @@ class sycl_backend auto e2 = q.submit([=](sycl::handler& h){ h.depends_on(e1); h.host_task([=](){ - s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count()); + s.report(execution_info::task_time, (report_clock_type::now() - t0).count()); }); }); return async_waiter{e2, std::make_shared(s)}; @@ -236,10 +241,23 @@ class sycl_backend void initialize_default_resources() { + bool profiling = true; + auto prop_list = sycl::property_list{}; auto devices = sycl::device::get_devices(); for (auto x : devices) { global_rank_.push_back(sycl::queue{x}); + if(!x.has(sycl::aspect::queue_profiling)){ + profiling = false; + } + } + is_profiling_enabled = profiling; + if(is_profiling_enabled){ + prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; + } + for (auto x : devices) + { + global_rank_.push_back(sycl::queue{x, prop_list}); } } }; From 6ce3b5ea21a954bc10e4b61c7bb272b0d6bc0ffe Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Thu, 25 Apr 2024 11:16:17 -0500 Subject: [PATCH 15/30] Changes to std::chrono::duration --- .../internal/dynamic_selection_impl/auto_tune_policy.h | 6 +++--- .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index cc74f845075..6688ffc6e02 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -50,7 +50,7 @@ class auto_tune_policy using timing_t = uint64_t; using report_clock_type = std::chrono::steady_clock; - using report_duration = std::chrono::duration; + using report_duration = std::chrono::milliseconds; static constexpr timing_t never_resample = 0; static constexpr size_type use_best_resource = ~size_type(0); @@ -105,8 +105,8 @@ class auto_tune_policy } else { - auto now = report_clock_type::now(); - auto ms = report_duration(now - t0_).count(); + const auto now = report_clock_type::now(); + const auto ms = std::chrono::duration_cast(now - t0_).count(); if (ms < resample_time_) { return use_best_resource; diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index d9750e23159..deaf31a03cf 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -38,7 +38,7 @@ class sycl_backend using resource_container_t = std::vector; using report_clock_type = std::chrono::steady_clock; - using report_duration = std::chrono::duration; + using report_duration = std::chrono::milliseconds; static inline bool is_profiling_enabled = false; private: @@ -78,7 +78,8 @@ class sycl_backend cl_ulong time_start = e_.template get_profiling_info(); cl_ulong time_end = e_.template get_profiling_info(); if(s!=nullptr){ - s->report(execution_info::task_time, report_duration(time_end-time_start).count()); + const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start); + s->report(execution_info::task_time, std::chrono::duration_cast(duration_in_ns).count()); } } @@ -205,7 +206,8 @@ class sycl_backend auto e2 = q.submit([=](sycl::handler& h){ h.depends_on(e1); h.host_task([=](){ - s.report(execution_info::task_time, (report_clock_type::now() - t0).count()); + const auto tp_now = report_clock_type::now(); + s.report(execution_info::task_time, std::chrono::duration_cast(tp_now - t0).count()); }); }); return async_waiter{e2, std::make_shared(s)}; From a8ad96bb3a4f5c1156999f1c34e9d5cf7487ae5e Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Thu, 25 Apr 2024 14:43:39 -0500 Subject: [PATCH 16/30] Addressed comments for sycl backend --- .../dynamic_selection_impl/sycl_backend.h | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index deaf31a03cf..7473c8f8f79 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -44,7 +44,6 @@ class sycl_backend private: class async_waiter_base{ public: - virtual void wait() = 0; virtual void report() = 0; virtual bool is_complete() = 0; virtual ~async_waiter_base() = default; @@ -56,10 +55,9 @@ class sycl_backend sycl::event e_; std::shared_ptr s; public: - async_waiter(sycl::event e) : e_(e) {} + async_waiter() = default; async_waiter(sycl::event e, std::shared_ptr selection) : e_(e), s(selection) {} - async_waiter(async_waiter &w) : e_(w.e_), s(w.s) {} sycl::event unwrap() { @@ -67,7 +65,7 @@ class sycl_backend } void - wait() override + wait() { e_.wait(); } @@ -75,9 +73,9 @@ class sycl_backend void report() override{ if constexpr (report_value_v){ - cl_ulong time_start = e_.template get_profiling_info(); - cl_ulong time_end = e_.template get_profiling_info(); if(s!=nullptr){ + cl_ulong time_start = e_.template get_profiling_info(); + cl_ulong time_end = e_.template get_profiling_info(); const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start); s->report(execution_info::task_time, std::chrono::duration_cast(duration_in_ns).count()); } @@ -98,10 +96,9 @@ class sycl_backend std::mutex m_; std::vector> async_waiters; - template - void add_waiter(T *t){ + void add_waiter(async_waiter_base *t){ std::lock_guard l(m_); - async_waiters.push_back(std::unique_ptr(t)); + async_waiters.push_back(std::unique_ptr(t)); } void lazy_report(){ @@ -185,6 +182,7 @@ class sycl_backend t0 = report_clock_type::now(); } } + async_waiter waiter; auto e1 = f(q, std::forward(args)...); if constexpr(report_info_v){ auto e2 = q.submit([=](sycl::handler& h){ @@ -193,14 +191,13 @@ class sycl_backend s.report(execution_info::task_completion); }); }); - return async_waiter{e2, std::make_shared(s)}; + waiter = async_waiter{e2, std::make_shared(s)}; } if constexpr(report_value_v){ if (is_profiling_enabled) { - auto waiter = async_waiter{e1,std::make_shared(s)}; + waiter = async_waiter{e1,std::make_shared(s)}; async_waiter_list.add_waiter(new async_waiter(waiter)); - return waiter; } else{ auto e2 = q.submit([=](sycl::handler& h){ @@ -210,9 +207,10 @@ class sycl_backend s.report(execution_info::task_time, std::chrono::duration_cast(tp_now - t0).count()); }); }); - return async_waiter{e2, std::make_shared(s)}; + waiter = async_waiter{e2, std::make_shared(s)}; } } + return waiter; } else { @@ -246,9 +244,8 @@ class sycl_backend bool profiling = true; auto prop_list = sycl::property_list{}; auto devices = sycl::device::get_devices(); - for (auto x : devices) + for (auto& x : devices) { - global_rank_.push_back(sycl::queue{x}); if(!x.has(sycl::aspect::queue_profiling)){ profiling = false; } @@ -257,7 +254,7 @@ class sycl_backend if(is_profiling_enabled){ prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; } - for (auto x : devices) + for (auto& x : devices) { global_rank_.push_back(sycl::queue{x, prop_list}); } From b70aeecdc933ec42c3fff526d06fb96b113806ca Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 26 Apr 2024 10:29:54 -0500 Subject: [PATCH 17/30] Addressed comments to sycl backend --- .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 7473c8f8f79..a053bdd5a7e 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -74,8 +74,8 @@ class sycl_backend report() override{ if constexpr (report_value_v){ if(s!=nullptr){ - cl_ulong time_start = e_.template get_profiling_info(); - cl_ulong time_end = e_.template get_profiling_info(); + const auto time_start = e_.template get_profiling_info(); + const auto time_end = e_.template get_profiling_info(); const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start); s->report(execution_info::task_time, std::chrono::duration_cast(duration_in_ns).count()); } @@ -214,7 +214,7 @@ class sycl_backend } else { - return async_waiter{f(unwrap(s), std::forward(args)...), std::make_shared(s)}; + return async_waiter{f(q, std::forward(args)...), std::make_shared(s)}; } } From 60710e2a909d18cb4c4771f577df25000698119d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 26 Apr 2024 17:33:29 +0200 Subject: [PATCH 18/30] Fix USM memory leaks and create unique task names (#1537) * test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp - fix USM shared memory leaks Signed-off-by: Sergey Kopienko * test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp - create unique task names for h.single_task([](){}); Signed-off-by: Sergey Kopienko --------- Signed-off-by: Sergey Kopienko --- .../sycl/test_auto_tune_policy_sycl.pass.cpp | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp index 23702af4c02..09809a8492f 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp @@ -14,6 +14,8 @@ #include #include "support/test_dynamic_selection_utils.h" #include "support/utils.h" +#include "support/sycl_alloc_utils.h" + #if TEST_DYNAMIC_SELECTION_AVAILABLE int @@ -53,8 +55,11 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) using my_policy_t = Policy; // they are cpus so this is ok - double* v = sycl::malloc_shared(1000000, u[0]); - int* j = sycl::malloc_shared(1, u[0]); + TestUtils::usm_data_transfer dt_helper_v(u[0], 1000000); + TestUtils::usm_data_transfer dt_helper_j(u[0], 1); + + double* v = dt_helper_v.get_data(); + int* j = dt_helper_j.get_data(); my_policy_t p{u}; auto n_samples = u.size(); @@ -102,7 +107,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) if (*j == 0) { return q.submit([=](sycl::handler& h){ - h.single_task([](){}); + h.single_task([](){}); }); } else @@ -148,7 +153,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) if (*j == 0) { return q.submit([=](sycl::handler& h){ - h.single_task([](){}); + h.single_task([](){}); }); } else @@ -189,8 +194,11 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource) using my_policy_t = Policy; // they are cpus so this is ok - double* v = sycl::malloc_shared(1000000, u[0]); - int* j = sycl::malloc_shared(1, u[0]); + TestUtils::usm_data_transfer dt_helper_v(u[0], 1000000); + TestUtils::usm_data_transfer dt_helper_j(u[0], 1); + + double* v = dt_helper_v.get_data(); + int* j = dt_helper_j.get_data(); my_policy_t p{u}; auto n_samples = u.size(); @@ -238,7 +246,7 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource) if (*j == 0) { return q.submit([=](sycl::handler& h){ - h.single_task([](){}); + h.single_task([](){}); }); } else @@ -284,7 +292,7 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource) if (*j == 0) { return q.submit([=](sycl::handler& h){ - h.single_task([](){}); + h.single_task([](){}); }); } else @@ -326,8 +334,11 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) using my_policy_t = Policy; // they are cpus so this is ok - double* v = sycl::malloc_shared(1000000, u[0]); - int* j = sycl::malloc_shared(1, u[0]); + TestUtils::usm_data_transfer dt_helper_v(u[0], 1000000); + TestUtils::usm_data_transfer dt_helper_j(u[0], 1); + + double* v = dt_helper_v.get_data(); + int* j = dt_helper_j.get_data(); my_policy_t p{u}; auto n_samples = u.size(); @@ -375,7 +386,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) if (*j == 0) { return q.submit([=](sycl::handler& h){ - h.single_task([](){}); + h.single_task([](){}); }); } else @@ -420,7 +431,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) if (*j == 0) { return q.submit([=](sycl::handler& h){ - h.single_task([](){}); + h.single_task([](){}); }); } else From 1e76278bd749d27012b35188df7bbd81336fd9c7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 26 Apr 2024 18:36:53 +0200 Subject: [PATCH 19/30] Fix the format of report method - using `report_duration` in second param (#1539) * include/oneapi/dpl/internal/dynamic_selection_traits.h - fix traits to specify report method second parameter type Signed-off-by: Sergey Kopienko * Change second parameter type of auto_tune_selection_type::report method - to report_duration (std::chrono::milliseconds) Signed-off-by: Sergey Kopienko --------- Signed-off-by: Sergey Kopienko --- .../dynamic_selection_impl/auto_tune_policy.h | 4 ++-- .../dynamic_selection_impl/sycl_backend.h | 16 +++++++-------- .../dpl/internal/dynamic_selection_traits.h | 20 +++++++++---------- test/support/inline_backend.h | 12 ++++++----- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 6688ffc6e02..a56260eaa22 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -174,9 +174,9 @@ class auto_tune_policy }; void - report(const execution_info::task_time_t&, const typename execution_info::task_time_t::value_type& v) const + report(const execution_info::task_time_t&, report_duration v) const { - tuner_->add_new_timing(resource_, v); + tuner_->add_new_timing(resource_, v.count()); } }; diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index a053bdd5a7e..18f811fd764 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -72,12 +72,12 @@ class sycl_backend void report() override{ - if constexpr (report_value_v){ + if constexpr (report_value_v){ if(s!=nullptr){ const auto time_start = e_.template get_profiling_info(); const auto time_end = e_.template get_profiling_info(); - const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start); - s->report(execution_info::task_time, std::chrono::duration_cast(duration_in_ns).count()); + s->report(execution_info::task_time, std::chrono::duration_cast( + std::chrono::nanoseconds(time_end - time_start))); } } @@ -172,10 +172,10 @@ class sycl_backend report(s, execution_info::task_submission); } if constexpr (report_info_v || - report_value_v) + report_value_v) { report_clock_type::time_point t0; - if constexpr (report_value_v) + if constexpr (report_value_v) { if (!is_profiling_enabled) { @@ -193,7 +193,7 @@ class sycl_backend }); waiter = async_waiter{e2, std::make_shared(s)}; } - if constexpr(report_value_v){ + if constexpr(report_value_v){ if (is_profiling_enabled) { waiter = async_waiter{e1,std::make_shared(s)}; @@ -203,8 +203,8 @@ class sycl_backend auto e2 = q.submit([=](sycl::handler& h){ h.depends_on(e1); h.host_task([=](){ - const auto tp_now = report_clock_type::now(); - s.report(execution_info::task_time, std::chrono::duration_cast(tp_now - t0).count()); + s.report(execution_info::task_time, + std::chrono::duration_cast(report_clock_type::now() - t0)); }); }); waiter = async_waiter{e2, std::make_shared(s)}; diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index c1955a86c42..47e36c8cba4 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -108,16 +108,16 @@ struct has_report : decltype(has_report_impl(0)) { }; -template +template auto has_report_value_impl(...) -> std::false_type; -template +template auto -has_report_value_impl(int) -> decltype(std::declval().report(std::declval(), 0), std::true_type{}); +has_report_value_impl(int) -> decltype(std::declval().report(std::declval(), std::declval()), std::true_type{}); -template -struct has_report_value : decltype(has_report_value_impl(0)) +template +struct has_report_value : decltype(has_report_value_impl(0)) { }; @@ -277,7 +277,7 @@ template void report(S&& s, const Info& i, const Value& v) { - if constexpr (internal::has_report_value::value) + if constexpr (internal::has_report_value::value) { std::forward(s).report(i, v); } @@ -291,13 +291,13 @@ struct report_info template inline constexpr bool report_info_v = report_info::value; -template +template struct report_value { - static constexpr bool value = internal::has_report_value::value; + static constexpr bool value = internal::has_report_value::value; }; -template -inline constexpr bool report_value_v = report_value::value; +template +inline constexpr bool report_value_v = report_value::value; } // namespace experimental } // namespace dpl diff --git a/test/support/inline_backend.h b/test/support/inline_backend.h index 301dfdf1315..24c0e7cfeaa 100644 --- a/test/support/inline_backend.h +++ b/test/support/inline_backend.h @@ -14,6 +14,7 @@ #include #include +#include namespace TestUtils { @@ -50,6 +51,7 @@ class int_inline_backend_t using wait_type = int; using execution_resource_t = basic_execution_resource_t; using resource_container_t = std::vector; + using report_duration = std::chrono::milliseconds; private: using native_resource_container_t = std::vector; @@ -99,8 +101,8 @@ class int_inline_backend_t submit(SelectionHandle s, Function&& f, Args&&... args) { std::chrono::steady_clock::time_point t0; - if constexpr (oneapi::dpl::experimental::report_value_v) + if constexpr (oneapi::dpl::experimental::report_value_v< + SelectionHandle, oneapi::dpl::experimental::execution_info::task_time_t, report_duration>) { t0 = std::chrono::steady_clock::now(); } @@ -116,11 +118,11 @@ class int_inline_backend_t { oneapi::dpl::experimental::report(s, oneapi::dpl::experimental::execution_info::task_completion); } - if constexpr (oneapi::dpl::experimental::report_value_v) + if constexpr (oneapi::dpl::experimental::report_value_v< + SelectionHandle, oneapi::dpl::experimental::execution_info::task_time_t, report_duration>) { report(s, oneapi::dpl::experimental::execution_info::task_time, - (std::chrono::steady_clock::now() - t0).count()); + std::chrono::duration_cast(std::chrono::steady_clock::now() - t0)); } return async_waiter{w}; } From c217e1a37466afbf48f3b6bf014547ead92ef053 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 26 Apr 2024 12:26:43 -0500 Subject: [PATCH 20/30] Fixing clang format --- .../dynamic_selection_impl/auto_tune_policy.h | 3 +- .../dynamic_selection_impl/backend_traits.h | 47 ++++---- .../dynamic_load_policy.h | 3 +- .../dynamic_selection_impl/sycl_backend.h | 106 +++++++++++------- .../dpl/internal/dynamic_selection_traits.h | 3 +- 5 files changed, 94 insertions(+), 68 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index a56260eaa22..1e6be9b1344 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -222,7 +222,8 @@ class auto_tune_policy select(Function&& f, Args&&... args) { static_assert(sizeof...(KeyArgs) == sizeof...(Args)); - if constexpr(backend_traits::lazy_report_v){ + if constexpr (backend_traits::lazy_report_v) + { backend_->lazy_report(); } if (state_) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h index b50a5b3a27c..5d085b24e4f 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h @@ -22,29 +22,30 @@ namespace experimental { namespace internal { - template - auto - has_lazy_report_impl(...) -> std::false_type; - - template - auto - has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true_type{}); - - template - struct has_lazy_report : decltype(has_lazy_report_impl(0)) - { - }; - -}//namespace internal - -namespace backend_traits { - template - struct lazy_report_value - { - static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report::value; - }; - template - inline constexpr bool lazy_report_v = lazy_report_value::value; +template +auto +has_lazy_report_impl(...) -> std::false_type; + +template +auto +has_lazy_report_impl(int) -> decltype(std::declval().lazy_report(), std::true_type{}); + +template +struct has_lazy_report : decltype(has_lazy_report_impl(0)) +{ +}; + +} //namespace internal + +namespace backend_traits +{ +template +struct lazy_report_value +{ + static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report::value; +}; +template +inline constexpr bool lazy_report_v = lazy_report_value::value; } //namespace backend_traits diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h index 9a7e60cd6fb..d8d656a0594 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h @@ -150,7 +150,8 @@ struct dynamic_load_policy selection_type select(Args&&...) { - if constexpr(backend_traits::lazy_report_v){ + if constexpr (backend_traits::lazy_report_v) + { backend_->lazy_report(); } if (state_) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 18f811fd764..a213fc03336 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -41,19 +41,24 @@ class sycl_backend using report_duration = std::chrono::milliseconds; static inline bool is_profiling_enabled = false; + private: - class async_waiter_base{ - public: - virtual void report() = 0; - virtual bool is_complete() = 0; - virtual ~async_waiter_base() = default; + class async_waiter_base + { + public: + virtual void + report() = 0; + virtual bool + is_complete() = 0; + virtual ~async_waiter_base() = default; }; - template + template class async_waiter : public async_waiter_base { sycl::event e_; std::shared_ptr s; + public: async_waiter() = default; async_waiter(sycl::event e, std::shared_ptr selection) : e_(e), s(selection) {} @@ -71,53 +76,64 @@ class sycl_backend } void - report() override{ - if constexpr (report_value_v){ - if(s!=nullptr){ - const auto time_start = e_.template get_profiling_info(); + report() override + { + if constexpr (report_value_v) + { + if (s != nullptr) + { + const auto time_start = + e_.template get_profiling_info(); const auto time_end = e_.template get_profiling_info(); s->report(execution_info::task_time, std::chrono::duration_cast( std::chrono::nanoseconds(time_end - time_start))); } } - - } bool - is_complete() override{ - return e_.get_info() == sycl::info::event_command_status::complete; + is_complete() override + { + return e_.get_info() == + sycl::info::event_command_status::complete; } - }; - struct async_waiter_list_t{ + struct async_waiter_list_t + { std::mutex m_; std::vector> async_waiters; - void add_waiter(async_waiter_base *t){ + void + add_waiter(async_waiter_base* t) + { std::lock_guard l(m_); async_waiters.push_back(std::unique_ptr(t)); } - void lazy_report(){ - if(is_profiling_enabled){ + void + lazy_report() + { + if (is_profiling_enabled) + { std::lock_guard l(m_); - async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::unique_ptr& async_waiter){ - if(async_waiter->is_complete()){ - async_waiter->report(); - return true; - } - return false; - }), async_waiters.end()); + async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), + [](std::unique_ptr& async_waiter) { + if (async_waiter->is_complete()) + { + async_waiter->report(); + return true; + } + return false; + }), + async_waiters.end()); } } }; async_waiter_list_t async_waiter_list; - class submission_group { resource_container_t resources_; @@ -154,7 +170,8 @@ class sycl_backend for (auto e : v) { global_rank_.push_back(e); - if(!e.template has_property()){ + if (!e.template has_property()) + { profiling = false; } } @@ -184,25 +201,26 @@ class sycl_backend } async_waiter waiter; auto e1 = f(q, std::forward(args)...); - if constexpr(report_info_v){ - auto e2 = q.submit([=](sycl::handler& h){ + if constexpr (report_info_v) + { + auto e2 = q.submit([=](sycl::handler& h) { h.depends_on(e1); - h.host_task([=](){ - s.report(execution_info::task_completion); - }); + h.host_task([=]() { s.report(execution_info::task_completion); }); }); - waiter = async_waiter{e2, std::make_shared(s)}; + waiter = async_waiter{e2, std::make_shared(s)}; } - if constexpr(report_value_v){ + if constexpr (report_value_v) + { if (is_profiling_enabled) { - waiter = async_waiter{e1,std::make_shared(s)}; + waiter = async_waiter{e1, std::make_shared(s)}; async_waiter_list.add_waiter(new async_waiter(waiter)); } - else{ - auto e2 = q.submit([=](sycl::handler& h){ + else + { + auto e2 = q.submit([=](sycl::handler& h) { h.depends_on(e1); - h.host_task([=](){ + h.host_task([=]() { s.report(execution_info::task_time, std::chrono::duration_cast(report_clock_type::now() - t0)); }); @@ -230,7 +248,9 @@ class sycl_backend return global_rank_; } - void lazy_report(){ + void + lazy_report() + { async_waiter_list.lazy_report(); } @@ -246,12 +266,14 @@ class sycl_backend auto devices = sycl::device::get_devices(); for (auto& x : devices) { - if(!x.has(sycl::aspect::queue_profiling)){ + if (!x.has(sycl::aspect::queue_profiling)) + { profiling = false; } } is_profiling_enabled = profiling; - if(is_profiling_enabled){ + if (is_profiling_enabled) + { prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; } for (auto& x : devices) diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h index 47e36c8cba4..bd6f79ba898 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h @@ -114,7 +114,8 @@ has_report_value_impl(...) -> std::false_type; template auto -has_report_value_impl(int) -> decltype(std::declval().report(std::declval(), std::declval()), std::true_type{}); +has_report_value_impl(int) + -> decltype(std::declval().report(std::declval(), std::declval()), std::true_type{}); template struct has_report_value : decltype(has_report_value_impl(0)) From 75dd2263e0a188f6df06f14b382c14bda836adac Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 26 Apr 2024 15:11:15 -0500 Subject: [PATCH 21/30] Adding header file for backend_traits --- .../dpl/internal/dynamic_selection_impl/dynamic_load_policy.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h index d8d656a0594..c34b92e1956 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h @@ -19,6 +19,7 @@ #include #include #include "oneapi/dpl/internal/dynamic_selection_traits.h" +#include "oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h" #if _DS_BACKEND_SYCL != 0 # include "oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h" #endif From bc95cd296be548ec17d02840f8fd5b695539b7ae Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 26 Apr 2024 16:21:53 -0500 Subject: [PATCH 22/30] Changed structure for sycl_backend submit --- .../dynamic_selection_impl/sycl_backend.h | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index a213fc03336..292b87cb4f9 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -199,34 +199,41 @@ class sycl_backend t0 = report_clock_type::now(); } } - async_waiter waiter; auto e1 = f(q, std::forward(args)...); - if constexpr (report_info_v) - { - auto e2 = q.submit([=](sycl::handler& h) { - h.depends_on(e1); - h.host_task([=]() { s.report(execution_info::task_completion); }); - }); - waiter = async_waiter{e2, std::make_shared(s)}; - } + async_waiter waiter{e1, std::make_shared(s)}; + + if constexpr (report_value_v) { if (is_profiling_enabled) { - waiter = async_waiter{e1, std::make_shared(s)}; async_waiter_list.add_waiter(new async_waiter(waiter)); } - else - { - auto e2 = q.submit([=](sycl::handler& h) { - h.depends_on(e1); - h.host_task([=]() { - s.report(execution_info::task_time, + } + + bool is_host_task_needed = report_value_v && !is_profiling_enabled + || report_info_v; + + if (is_host_task_needed) + { + auto e2 = q.submit([=](sycl::handler& h) { + h.depends_on(e1); + h.host_task([=]() { + if constexpr (report_value_v) + { + if(!is_profiling_enabled) + { + s.report(execution_info::task_time, std::chrono::duration_cast(report_clock_type::now() - t0)); - }); + } + } + if constexpr (report_info_v) + { + s.report(execution_info::task_completion); + } }); - waiter = async_waiter{e2, std::make_shared(s)}; - } + }); + waiter = async_waiter{e2, std::make_shared(s)}; } return waiter; } From 19e78e711b8794169661acfae0770bcc66a9e4e1 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Fri, 26 Apr 2024 16:59:25 -0500 Subject: [PATCH 23/30] Fixed clang format --- .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 292b87cb4f9..49fe0ac6fef 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -202,7 +202,6 @@ class sycl_backend auto e1 = f(q, std::forward(args)...); async_waiter waiter{e1, std::make_shared(s)}; - if constexpr (report_value_v) { if (is_profiling_enabled) @@ -211,8 +210,9 @@ class sycl_backend } } - bool is_host_task_needed = report_value_v && !is_profiling_enabled - || report_info_v; + bool is_host_task_needed = report_value_v && + !is_profiling_enabled || + report_info_v; if (is_host_task_needed) { @@ -221,10 +221,10 @@ class sycl_backend h.host_task([=]() { if constexpr (report_value_v) { - if(!is_profiling_enabled) + if (!is_profiling_enabled) { s.report(execution_info::task_time, - std::chrono::duration_cast(report_clock_type::now() - t0)); + std::chrono::duration_cast(report_clock_type::now() - t0)); } } if constexpr (report_info_v) From 3d3dab2968b7efa950c7a037b501f582155ababb Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Mon, 29 Apr 2024 11:28:24 -0500 Subject: [PATCH 24/30] Changes to make variables is_profiling_enabled and aliases for report function private --- .../dpl/internal/dynamic_selection_impl/backend_traits.h | 1 - .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h index 5d085b24e4f..26ce70171f7 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h @@ -11,7 +11,6 @@ #define _ONEDPL_INTERNAL_BACKEND_TRAITS_H #include -#include #include namespace oneapi diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 49fe0ac6fef..e83823a94ac 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -37,12 +37,13 @@ class sycl_backend using execution_resource_t = resource_type; using resource_container_t = std::vector; - using report_clock_type = std::chrono::steady_clock; - using report_duration = std::chrono::milliseconds; + + private: static inline bool is_profiling_enabled = false; + using report_clock_type = std::chrono::steady_clock; + using report_duration = std::chrono::milliseconds; - private: class async_waiter_base { public: From 80096690b1f6e45b7f9e19f05a55f890ac8b89b1 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Mon, 29 Apr 2024 13:40:09 -0500 Subject: [PATCH 25/30] Moved is_profiling_enabled outside the async_waiter class --- .../dynamic_selection_impl/sycl_backend.h | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index e83823a94ac..5c376251c46 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -37,9 +37,7 @@ class sycl_backend using execution_resource_t = resource_type; using resource_container_t = std::vector; - private: - static inline bool is_profiling_enabled = false; using report_clock_type = std::chrono::steady_clock; using report_duration = std::chrono::milliseconds; @@ -116,20 +114,18 @@ class sycl_backend void lazy_report() { - if (is_profiling_enabled) - { - std::lock_guard l(m_); - async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), - [](std::unique_ptr& async_waiter) { - if (async_waiter->is_complete()) - { - async_waiter->report(); - return true; - } - return false; - }), - async_waiters.end()); - } + std::lock_guard l(m_); + async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), + [](std::unique_ptr& async_waiter) { + if (async_waiter->is_complete()) + { + async_waiter->report(); + return true; + } + return false; + }), + async_waiters.end()); + } }; @@ -259,7 +255,9 @@ class sycl_backend void lazy_report() { - async_waiter_list.lazy_report(); + if (is_profiling_enabled){ + async_waiter_list.lazy_report(); + } } private: From ae46b12a943803b2083464ca8ba78c894c7ff559 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Mon, 29 Apr 2024 14:51:49 -0500 Subject: [PATCH 26/30] Making sycl backend submit function more readable --- .../dynamic_selection_impl/sycl_backend.h | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 5c376251c46..54e79e5abf9 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -180,16 +180,19 @@ class sycl_backend auto submit(SelectionHandle s, Function&& f, Args&&... args) { + constexpr bool report_task_completion = report_info_v; + constexpr bool report_task_submission = report_info_v; + constexpr bool report_task_time = report_value_v; + auto q = unwrap(s); - if constexpr (report_info_v) + if constexpr (report_task_submission) { report(s, execution_info::task_submission); } - if constexpr (report_info_v || - report_value_v) + if constexpr (report_task_completion || report_task_time) { report_clock_type::time_point t0; - if constexpr (report_value_v) + if constexpr (report_task_time) { if (!is_profiling_enabled) { @@ -199,7 +202,7 @@ class sycl_backend auto e1 = f(q, std::forward(args)...); async_waiter waiter{e1, std::make_shared(s)}; - if constexpr (report_value_v) + if constexpr (report_task_time) { if (is_profiling_enabled) { @@ -207,16 +210,14 @@ class sycl_backend } } - bool is_host_task_needed = report_value_v && - !is_profiling_enabled || - report_info_v; + bool is_host_task_needed = report_task_time && !is_profiling_enabled || report_task_completion; if (is_host_task_needed) { auto e2 = q.submit([=](sycl::handler& h) { h.depends_on(e1); h.host_task([=]() { - if constexpr (report_value_v) + if constexpr (report_task_time) { if (!is_profiling_enabled) { @@ -224,7 +225,7 @@ class sycl_backend std::chrono::duration_cast(report_clock_type::now() - t0)); } } - if constexpr (report_info_v) + if constexpr (report_task_completion) { s.report(execution_info::task_completion); } From 0cf0daab98663ccf69998806363193bfc01a15d0 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Mon, 29 Apr 2024 16:00:25 -0500 Subject: [PATCH 27/30] Fixed clang format --- .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 54e79e5abf9..59637b8d24c 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -125,7 +125,6 @@ class sycl_backend return false; }), async_waiters.end()); - } }; @@ -182,7 +181,7 @@ class sycl_backend { constexpr bool report_task_completion = report_info_v; constexpr bool report_task_submission = report_info_v; - constexpr bool report_task_time = report_value_v; + constexpr bool report_task_time = report_value_v; auto q = unwrap(s); if constexpr (report_task_submission) @@ -256,7 +255,8 @@ class sycl_backend void lazy_report() { - if (is_profiling_enabled){ + if (is_profiling_enabled) + { async_waiter_list.lazy_report(); } } From 5028e631e4640528392cb8ce8f92cf7727dbf026 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 30 Apr 2024 17:30:21 +0200 Subject: [PATCH 28/30] Simplify sycl_backend::submit(SelectionHandle s, Function&& f, Args&&... args) function (#1547) * include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h - simplify sycl_backend::submit(SelectionHandle s, Function&& f, Args&&... args) function Signed-off-by: Sergey Kopienko * include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h - remove not required type name Signed-off-by: Sergey Kopienko --------- Signed-off-by: Sergey Kopienko --- .../dynamic_selection_impl/sycl_backend.h | 30 +++++-------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 59637b8d24c..a99d4bcb3a0 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -184,34 +184,24 @@ class sycl_backend constexpr bool report_task_time = report_value_v; auto q = unwrap(s); + if constexpr (report_task_submission) - { report(s, execution_info::task_submission); - } + if constexpr (report_task_completion || report_task_time) { - report_clock_type::time_point t0; - if constexpr (report_task_time) - { - if (!is_profiling_enabled) - { - t0 = report_clock_type::now(); - } - } + const auto t0 = report_clock_type::now(); + auto e1 = f(q, std::forward(args)...); async_waiter waiter{e1, std::make_shared(s)}; if constexpr (report_task_time) { if (is_profiling_enabled) - { async_waiter_list.add_waiter(new async_waiter(waiter)); - } } - bool is_host_task_needed = report_task_time && !is_profiling_enabled || report_task_completion; - - if (is_host_task_needed) + if (report_task_time && !is_profiling_enabled || report_task_completion) { auto e2 = q.submit([=](sycl::handler& h) { h.depends_on(e1); @@ -219,25 +209,19 @@ class sycl_backend if constexpr (report_task_time) { if (!is_profiling_enabled) - { s.report(execution_info::task_time, std::chrono::duration_cast(report_clock_type::now() - t0)); - } } if constexpr (report_task_completion) - { s.report(execution_info::task_completion); - } }); }); waiter = async_waiter{e2, std::make_shared(s)}; } return waiter; } - else - { - return async_waiter{f(q, std::forward(args)...), std::make_shared(s)}; - } + + return async_waiter{f(q, std::forward(args)...), std::make_shared(s)}; } auto From aa07b29f8184dbc2723c76b4fb5b6b7adb764d8f Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Tue, 30 Apr 2024 10:41:37 -0500 Subject: [PATCH 29/30] Minor changes to auto_tune_policy.h and sycl_backend.h --- .../internal/dynamic_selection_impl/auto_tune_policy.h | 2 +- .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h index 1e6be9b1344..9fc901977ac 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h @@ -58,7 +58,7 @@ class auto_tune_policy struct resource_with_index_t { wrapped_resource_t r_; - size_type index_; + size_type index_ = 0; }; struct time_data_t diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index a99d4bcb3a0..ae5d5a1e008 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -46,9 +46,9 @@ class sycl_backend { public: virtual void - report() = 0; + report() const = 0; virtual bool - is_complete() = 0; + is_complete() const = 0; virtual ~async_waiter_base() = default; }; @@ -75,7 +75,7 @@ class sycl_backend } void - report() override + report() const override { if constexpr (report_value_v) { @@ -91,7 +91,7 @@ class sycl_backend } bool - is_complete() override + is_complete() const override { return e_.get_info() == sycl::info::event_command_status::complete; From 797ea84356c4b462e503b55064be93a24c23befc Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Tue, 30 Apr 2024 10:48:41 -0500 Subject: [PATCH 30/30] Improving readability --- .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index ae5d5a1e008..52df9966946 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -45,10 +45,8 @@ class sycl_backend class async_waiter_base { public: - virtual void - report() const = 0; - virtual bool - is_complete() const = 0; + virtual void report() const = 0; + virtual bool is_complete() const = 0; virtual ~async_waiter_base() = default; };