From 9417ddb0b5770a206d411d0eac5a24b0ffa08fdd Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Tue, 9 Jan 2024 15:15:30 -0600
Subject: [PATCH 01/30] Added static assert for keyArgs==Args

---
 .../dpl/internal/dynamic_selection_impl/auto_tune_policy.h       | 1 +
 1 file changed, 1 insertion(+)
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index c6b247e1abd..c5f95c83013 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -241,6 +241,7 @@ class auto_tune_policy
     auto
     submit(selection_type e, Function&& f, Args&&... args)
     {
+        static_assert(sizeof...(KeyArgs) == sizeof...(Args));
         if (backend_)
         {
             return backend_->submit(e, std::forward<Function>(f), std::forward<Args>(args)...);

From 7e54e28c41b1463f0226f6b0ef38f83398e0624b Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Wed, 10 Jan 2024 13:23:21 -0600
Subject: [PATCH 02/30] WIP for jit compilation

---
 .../dynamic_selection_impl/auto_tune_policy.h |   3 +
 .../dynamic_selection_impl/sycl_backend.h     |  20 +-
 ...est_auto_tune_policy_sycl_profile.pass.cpp | 536 ++++++++++++++++++
 3 files changed, 557 insertions(+), 2 deletions(-)
 create mode 100644 test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index c5f95c83013..ac49493fd84 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -124,6 +124,7 @@ class auto_tune_policy
             {
                 // ignore the 1st timing to cover for JIT compilation
                 time_by_index_[index] = time_data_t{0, std::numeric_limits<timing_t>::max()};
+                std::cout<<"This timing is ignored\n";
             }
             else
             {
@@ -132,6 +133,7 @@ class auto_tune_policy
                 new_value = (n * td.value_ + t) / (n + 1);
                 td.num_timings_ = n + 1;
                 td.value_ = new_value;
+
             }
             if (new_value < best_timing_)
             {
@@ -221,6 +223,7 @@ class auto_tune_policy
             auto k = make_task_key(std::forward<Function>(f), std::forward<Args>(args)...);
             auto t = state_->tuner_by_key_[k];
             auto index = t->get_resource_to_profile();
+            std::cout<<"Device used : "<<index<<"\n";
             if (index == use_best_resource)
             {
                 return selection_type{*this, t->best_resource_, t};
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 62746edf0d8..dd06775b310 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -106,9 +106,13 @@ class sycl_backend
                       report_value_v<SelectionHandle, execution_info::task_time_t>)
         {
             std::chrono::steady_clock::time_point t0;
+            bool use_event_profiling = q.template has_property<sycl::property::queue::enable_profiling>();
             if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
             {
-                t0 = std::chrono::steady_clock::now();
+                if (!use_event_profiling)
+                {
+                    t0 = std::chrono::steady_clock::now();
+                }
             }
             auto e1 = f(q, std::forward<Args>(args)...);
             auto e2 = q.submit([=](sycl::handler& h) {
@@ -119,7 +123,19 @@ class sycl_backend
                         s.report(execution_info::task_completion);
                     }
                     if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
-                        s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
+                    {
+                        if (use_event_profiling)
+                        {
+                            cl_ulong time_start = e1.template get_profiling_info<sycl::info::event_profiling::command_start>();
+                            cl_ulong time_end = e1.template get_profiling_info<sycl::info::event_profiling::command_end>();
+                            std::cout<<"Total time : "<<time_end-time_start<<"\n";
+                            s.report(execution_info::task_time, time_end - time_start);
+                        }
+                        else
+                        {
+                            s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
+                        }
+                    }
                 });
             });
             return async_waiter{e2};
diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp
new file mode 100644
index 00000000000..229f4bd11c2
--- /dev/null
+++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp
@@ -0,0 +1,536 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) 2023 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <iostream>
+#include <thread>
+#include "oneapi/dpl/dynamic_selection"
+#include "support/test_dynamic_selection_utils.h"
+#include "support/test_config.h"
+#if TEST_DYNAMIC_SELECTION_AVAILABLE
+#    include "support/sycl_sanity.h"
+
+template <bool call_select_before_submit, typename Policy, typename UniverseContainer>
+int
+test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
+{
+    using my_policy_t = Policy;
+
+    int j;
+    std::vector<double> v(1000000, 0.0);
+
+    my_policy_t p{u};
+    auto n_samples = u.size();
+
+    const int N = 10;
+    std::atomic<int> ecount = 0;
+    bool pass = true;
+
+    for (int i = 1; i <= N; ++i)
+    {
+        if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource)
+        {
+            j = 100;
+        }
+        else
+        {
+            j = 0;
+        }
+        const size_t bytes = 1000000 * sizeof(double);
+        if constexpr (call_select_before_submit)
+        {
+            auto f = [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
+                if (i <= 2 * n_samples)
+                {
+                    // we should be round-robining through the resources
+                    if (q != u[(i - 1) % n_samples])
+                    {
+                        std::cout << i << ": mismatch during rr phase\n" << std::flush;
+                        pass = false;
+                    }
+                }
+                else
+                {
+                    if (q != u[best_resource])
+                    {
+                        std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
+                        pass = false;
+                    }
+                }
+                ecount += i;
+                if (j == 0)
+                {
+                     return q.submit([&](sycl::handler& h)
+                     {
+                             h.single_task([=](){});
+                         });
+                }
+                else
+                {
+                    return q.submit([&](sycl::handler& h) {
+                        double *d_v = sycl::malloc_device<double>(1000000, q);
+                        q.memcpy(d_v, v.data(), bytes).wait();
+                        q.parallel_for(
+                            1000000, [=](sycl::id<1> idx) {
+                                for (int j0 = 0; j0 < j; ++j0)
+                                {
+                                    d_v[idx] += idx;
+                                }
+                            });
+                        q.memcpy(v.data(), d_v, bytes).wait();
+                    });
+                }
+            };
+            auto s = oneapi::dpl::experimental::select(p, f);
+            auto e = oneapi::dpl::experimental::submit(s, f);
+            oneapi::dpl::experimental::wait(e);
+        }
+        else
+        {
+            // it's ok to capture by reference since we are waiting on each call
+            auto s = oneapi::dpl::experimental::submit(
+                p, [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
+                    if (i <= 2 * n_samples)
+                    {
+                        // we should be round-robining through the resources
+                        if (q != u[(i - 1) % n_samples])
+                        {
+                            std::cout << i << ": mismatch during rr phase\n" << std::flush;
+                            pass = false;
+                        }
+                    }
+                    else
+                    {
+                        if (q != u[best_resource])
+                        {
+                            std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
+                            pass = false;
+                        }
+                    }
+                    ecount += i;
+                    if (j == 0)
+                    {
+                     return q.submit([&](sycl::handler& h){
+                             h.single_task([=](){});
+                         });
+                    }
+                    else
+                    {
+                        return q.submit([&](sycl::handler& h) {
+                            double *d_v = sycl::malloc_device<double>(1000000, q);
+                            q.memcpy(d_v, v.data(), bytes).wait();
+                            q.parallel_for(
+                                1000000, [=](sycl::id<1> idx) {
+                                    for (int j0 = 0; j0 < j; ++j0)
+                                    {
+                                        d_v[idx] += idx;
+                                    }
+                                });
+                            q.memcpy(v.data(), d_v, bytes).wait();
+                        });
+                    }
+                });
+            oneapi::dpl::experimental::wait(s);
+        }
+
+        int count = ecount.load();
+        if (count != i * (i + 1) / 2)
+        {
+            std::cout << "ERROR: scheduler did not execute all tasks exactly once\n";
+            return 1;
+        }
+    }
+    if (!pass)
+    {
+        std::cout << "ERROR: did not select expected resources\n";
+        return 1;
+    }
+    if constexpr (call_select_before_submit)
+    {
+        std::cout << "select then submit and wait on event: OK\n";
+    }
+    else
+    {
+        std::cout << "submit and wait on event: OK\n";
+    }
+    return 0;
+}
+
+template <bool call_select_before_submit, typename Policy, typename UniverseContainer>
+int
+test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
+{
+    using my_policy_t = Policy;
+
+    int j;
+    std::vector<double> v(1000000, 0.0);
+
+    my_policy_t p{u};
+    auto n_samples = u.size();
+
+    const int N = 10;
+    std::atomic<int> ecount = 0;
+    bool pass = true;
+
+    for (int i = 1; i <= N; ++i)
+    {
+        if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource)
+        {
+            j = 100;
+        }
+        else
+        {
+            j = 0;
+        }
+        const size_t bytes = 1000000 * sizeof(double);
+        if constexpr (call_select_before_submit)
+        {
+            auto f = [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
+                if (i <= 2 * n_samples)
+                {
+                    // we should be round-robining through the resources
+                    if (q != u[(i - 1) % n_samples])
+                    {
+                        std::cout << i << ": mismatch during rr phase\n" << std::flush;
+                        pass = false;
+                    }
+                }
+                else
+                {
+                    if (q != u[best_resource])
+                    {
+                        std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
+                        pass = false;
+                    }
+                }
+                ecount += i;
+                if (j == 0)
+                {
+                     return q.submit([=](sycl::handler& h){
+                            h.single_task([=](){});
+                         });
+                }
+                else
+                {
+                    return q.submit([&](sycl::handler& h) {
+                            double *d_v = sycl::malloc_device<double>(1000000, q);
+                            q.memcpy(d_v, v.data(), bytes).wait();
+                            h.parallel_for(
+                            1000000, [=](sycl::id<1> idx) {
+                                for (int j0 = 0; j0 < j; ++j0)
+                                {
+                                    d_v[idx] += idx;
+                                }
+                            });
+                            q.memcpy(v.data(), d_v, bytes).wait();
+                    });
+                }
+            };
+            auto s = oneapi::dpl::experimental::select(p, f);
+            auto e = oneapi::dpl::experimental::submit(s, f);
+            oneapi::dpl::experimental::wait(p.get_submission_group());
+        }
+        else
+        {
+            // it's ok to capture by reference since we are waiting on each call
+            auto s = oneapi::dpl::experimental::submit(
+                p, [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
+                    if (i <= 2 * n_samples)
+                    {
+                        // we should be round-robining through the resources
+                        if (q != u[(i - 1) % n_samples])
+                        {
+                            std::cout << i << ": mismatch during rr phase\n" << std::flush;
+                            pass = false;
+                        }
+                    }
+                    else
+                    {
+                        if (q != u[best_resource])
+                        {
+                            std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
+                            pass = false;
+                        }
+                    }
+                    ecount += i;
+                    if (j == 0)
+                    {
+                     return q.submit([=](sycl::handler& h){
+                             h.single_task([=](){});
+                         });
+                    }
+                    else
+                    {
+                        return q.submit([&](sycl::handler& h) {
+                            double *d_v = sycl::malloc_device<double>(1000000, q);
+                            q.memcpy(d_v, v.data(), bytes).wait();
+                            h.parallel_for(
+                                1000000, [=](sycl::id<1> idx) {
+                                    for (int j0 = 0; j0 < j; ++j0)
+                                    {
+                                        d_v[idx] += idx;
+                                    }
+                                });
+                            q.memcpy(v.data(), d_v, bytes).wait();
+                        });
+                    }
+                });
+            oneapi::dpl::experimental::wait(p.get_submission_group());
+        }
+
+        int count = ecount.load();
+        if (count != i * (i + 1) / 2)
+        {
+            std::cout << "ERROR: scheduler did not execute all tasks exactly once\n";
+            return 1;
+        }
+    }
+    if (!pass)
+    {
+        std::cout << "ERROR: did not select expected resources\n";
+        return 1;
+    }
+    if constexpr (call_select_before_submit)
+    {
+        std::cout << "select then submit and wait on group: OK\n";
+    }
+    else
+    {
+        std::cout << "submit and wait on group: OK\n";
+    }
+    return 0;
+}
+
+template <bool call_select_before_submit, typename Policy, typename UniverseContainer>
+int
+test_auto_submit_and_wait(UniverseContainer u, int best_resource)
+{
+    using my_policy_t = Policy;
+
+    // they are cpus so this is ok
+    int j;
+    std::vector<double> v(1000000, 0.0);
+
+    my_policy_t p{u};
+    auto n_samples = u.size();
+
+    const int N = 10;
+    std::atomic<int> ecount = 0;
+    bool pass = true;
+
+    for (int i = 1; i <= N; ++i)
+    {
+        if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource)
+        {
+            j = 100;
+        }
+        else
+        {
+            j = 0;
+        }
+        const size_t bytes = 1000000 * sizeof(double);
+        if constexpr (call_select_before_submit)
+        {
+            auto f = [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
+                if (i <= 2 * n_samples)
+                {
+                    // we should be round-robining through the resources
+                    if (q != u[(i - 1) % n_samples])
+                    {
+                        std::cout << i << ": mismatch during rr phase\n" << std::flush;
+                        pass = false;
+                    }
+                }
+                else
+                {
+                    if (q != u[best_resource])
+                    {
+                        std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
+                        pass = false;
+                    }
+                }
+                ecount += i;
+                if (j == 0)
+                {
+                     return q.submit([=](sycl::handler& h){
+                             h.single_task([=](){});
+                         });
+                }
+                else
+                {
+                    return q.submit([&](sycl::handler& h) {
+                        double *d_v = sycl::malloc_device<double>(1000000, q);
+                        q.memcpy(d_v, v.data(), bytes).wait();
+                        h.parallel_for(
+                            1000000, [=](sycl::id<1> idx) {
+                                for (int j0 = 0; j0 < j; ++j0)
+                                {
+                                    d_v[idx] += idx;
+                                }
+                            });
+                            q.memcpy(v.data(), d_v, bytes).wait();
+                    });
+                }
+            };
+            auto s = oneapi::dpl::experimental::select(p, f);
+            oneapi::dpl::experimental::submit_and_wait(s, f);
+        }
+        else
+        {
+            // it's ok to capture by reference since we are waiting on each call
+            oneapi::dpl::experimental::submit_and_wait(
+                p, [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
+                    if (i <= 2 * n_samples)
+                    {
+                        // we should be round-robining through the resources
+                        if (q != u[(i - 1) % n_samples])
+                        {
+                            std::cout << i << ": mismatch during rr phase\n" << std::flush;
+                            pass = false;
+                        }
+                    }
+                    else
+                    {
+                        if (q != u[best_resource])
+                        {
+                            std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
+                            pass = false;
+                        }
+                    }
+                    ecount += i;
+                    if (j == 0)
+                    {
+                     return q.submit([=](sycl::handler& h){
+                             h.single_task([=](){});
+                         });
+                    }
+                    else
+                    {
+                        return q.submit([&](sycl::handler& h) {
+                            double *d_v = sycl::malloc_device<double>(1000000, q);
+                            q.memcpy(d_v, v.data(), bytes).wait();
+                            h.parallel_for(
+                                1000000, [=](sycl::id<1> idx) {
+                                    for (int j0 = 0; j0 < j; ++j0)
+                                    {
+                                        d_v[idx] += idx;
+                                    }
+                                });
+                            q.memcpy(v.data(), d_v, bytes).wait();
+                        });
+                    }
+                });
+        }
+
+        int count = ecount.load();
+        if (count != i * (i + 1) / 2)
+        {
+            std::cout << "ERROR: scheduler did not execute all tasks exactly once\n";
+            return 1;
+        }
+    }
+    if (!pass)
+    {
+        std::cout << "ERROR: did not select expected resources\n";
+        return 1;
+    }
+    if constexpr (call_select_before_submit)
+    {
+        std::cout << "select then submit_and_wait: OK\n";
+    }
+    else
+    {
+        std::cout << "submit_and_wait: OK\n";
+    }
+    return 0;
+}
+
+static inline void
+build_auto_tune_universe1(std::vector<sycl::queue>& u)
+{
+    auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
+    try
+    {
+        auto device_cpu = sycl::device(sycl::cpu_selector_v);
+        sycl::queue cpu_queue{device_cpu, prop_list};
+        run_sycl_sanity_test(cpu_queue);
+        u.push_back(cpu_queue);
+    }
+    catch (const sycl::exception&)
+    {
+        std::cout << "SKIPPED: Unable to run with cpu_selector\n";
+    }
+    try
+    {
+        auto device_gpu = sycl::device(sycl::gpu_selector_v);
+        sycl::queue gpu_queue{device_gpu, prop_list};
+        run_sycl_sanity_test(gpu_queue);
+        u.push_back(gpu_queue);
+    }
+    catch (const sycl::exception&)
+    {
+        std::cout << "SKIPPED: Unable to run with gpu_selector\n";
+    }
+}
+
+#endif
+
+int
+main()
+{
+#if TEST_DYNAMIC_SELECTION_AVAILABLE
+    using policy_t = oneapi::dpl::experimental::auto_tune_policy<oneapi::dpl::experimental::sycl_backend>;
+    std::vector<sycl::queue> u;
+    build_auto_tune_universe1(u);
+
+    //If building the universe is not a success, return
+    if (u.size() == 0 || u.size()==0)
+        return 0;
+
+    constexpr bool just_call_submit = false;
+    constexpr bool call_select_before_submit = true;
+
+    if (test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1)||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1) ||
+        // now select then submits
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 1) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 1) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 1) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 0) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 1) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1))
+    {
+        std::cout << "FAIL\n";
+        return 1;
+    }
+    else
+    {
+        std::cout << "PASS\n";
+        return 0;
+    }
+#else
+    std::cout << "SKIPPED\n";
+    return 0;
+#endif // TEST_DYNAMIC_SELECTION_AVAILABLE
+}

From 268f0d59bd38efdebfd8dc5b6ddaa9d3ba5f2787 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Wed, 17 Jan 2024 13:55:49 -0600
Subject: [PATCH 03/30] Fixing Autotune bug

---
 .../dynamic_selection_impl/auto_tune_policy.h | 11 +++--
 .../dynamic_selection_impl/sycl_backend.h     | 40 ++++++++++++++-----
 ...est_auto_tune_policy_sycl_profile.pass.cpp | 12 +++---
 3 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index ac49493fd84..011b6dba0c0 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -120,21 +120,21 @@ class auto_tune_policy
             std::unique_lock<std::mutex> l(m_);
             auto index = r.index_;
             timing_t new_value = t;
-            if (time_by_index_.count(index) == 0)
+            /*if (time_by_index_.count(index) == 0)
             {
                 // ignore the 1st timing to cover for JIT compilation
                 time_by_index_[index] = time_data_t{0, std::numeric_limits<timing_t>::max()};
                 std::cout<<"This timing is ignored\n";
             }
             else
-            {
+            {*/
                 auto& td = time_by_index_[index];
                 auto n = td.num_timings_;
                 new_value = (n * td.value_ + t) / (n + 1);
                 td.num_timings_ = n + 1;
                 td.value_ = new_value;
 
-            }
+            //}
             if (new_value < best_timing_)
             {
                 best_timing_ = new_value;
@@ -171,6 +171,7 @@ class auto_tune_policy
         void
         report(const execution_info::task_time_t&, const typename execution_info::task_time_t::value_type& v) const
         {
+            std::cout<<"Autotune report : "<<v<<"\n";
             tuner_->add_new_timing(resource_, v);
         }
     };
@@ -223,14 +224,16 @@ class auto_tune_policy
             auto k = make_task_key(std::forward<Function>(f), std::forward<Args>(args)...);
             auto t = state_->tuner_by_key_[k];
             auto index = t->get_resource_to_profile();
-            std::cout<<"Device used : "<<index<<"\n";
+            //std::cout<<"Device used : "<<index<<"\n";
             if (index == use_best_resource)
             {
+                std::cout<<"Testing Device used : "<<t->best_resource_.index_<<"\n";
                 return selection_type{*this, t->best_resource_, t};
             }
             else
             {
                 auto r = state_->resources_with_index_[index];
+                std::cout<<"Trial Device used : "<<index<<"\n";
                 return selection_type{*this, r, t};
             }
         }
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index dd06775b310..48b54ddda43 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -19,6 +19,7 @@
 #include <vector>
 #include <memory>
 #include <utility>
+#include <atomic>
 
 namespace oneapi
 {
@@ -80,6 +81,7 @@ class sycl_backend
     {
         initialize_default_resources();
         sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
+        number_of_resources=global_rank_.size();
     }
 
     template <typename NativeUniverseVector>
@@ -91,6 +93,7 @@ class sycl_backend
             global_rank_.push_back(e);
         }
         sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
+        number_of_resources=global_rank_.size();
     }
 
     template <typename SelectionHandle, typename Function, typename... Args>
@@ -115,6 +118,7 @@ class sycl_backend
                 }
             }
             auto e1 = f(q, std::forward<Args>(args)...);
+        break;
             auto e2 = q.submit([=](sycl::handler& h) {
                 h.depends_on(e1);
                 h.host_task([=]() {
@@ -124,18 +128,35 @@ class sycl_backend
                     }
                     if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
                     {
-                        if (use_event_profiling)
-                        {
-                            cl_ulong time_start = e1.template get_profiling_info<sycl::info::event_profiling::command_start>();
-                            cl_ulong time_end = e1.template get_profiling_info<sycl::info::event_profiling::command_end>();
-                            std::cout<<"Total time : "<<time_end-time_start<<"\n";
-                            s.report(execution_info::task_time, time_end - time_start);
+                        if(number_of_resources==0){
+                            if (use_event_profiling)
+                            {
+                                try{
+                                    cl_ulong time_start = e1.template get_profiling_info<sycl::info::event_profiling::command_start>();
+                                //std::cout<<" Time start : "<<time_start<<"\n";
+                                    cl_ulong time_end = e1.template get_profiling_info<sycl::info::event_profiling::command_end>();
+
+                                //std::cout<<"Time end : "<<time_end<<"\n";
+                                //std::cout<<"Total time : "<<time_end-time_start<<"\n";
+                                s.report(execution_info::task_time, time_end - time_start);
+                                }catch(std::exception &e){
+                                    std::cout<<e.what();
+                                    break;
+                                }
+                            }
+                            else
+                            {
+
+                                s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
+                            }
                         }
-                        else
-                        {
-                            s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
+                        else{
+                            std::cout<<"Timing ignored\n";
+                            number_of_resources--;
                         }
+
                     }
+                    std::cout<<"Exit\n";
                 });
             });
             return async_waiter{e2};
@@ -159,6 +180,7 @@ class sycl_backend
     }
 
   private:
+    std::atomic<int> number_of_resources;;
     resource_container_t global_rank_;
     std::unique_ptr<submission_group> sgroup_ptr_;
 
diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp
index 229f4bd11c2..b8cc1f686f0 100644
--- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp
+++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp
@@ -115,12 +115,14 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                     ecount += i;
                     if (j == 0)
                     {
+                     std::cout<<"Device 0\n";
                      return q.submit([&](sycl::handler& h){
                              h.single_task([=](){});
                          });
                     }
                     else
                     {
+                        std::cout<<"Device 1\n";
                         return q.submit([&](sycl::handler& h) {
                             double *d_v = sycl::malloc_device<double>(1000000, q);
                             q.memcpy(d_v, v.data(), bytes).wait();
@@ -451,7 +453,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
 }
 
 static inline void
-build_auto_tune_universe1(std::vector<sycl::queue>& u)
+build_auto_tune_universe(std::vector<sycl::queue>& u)
 {
     auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
     try
@@ -486,7 +488,7 @@ main()
 #if TEST_DYNAMIC_SELECTION_AVAILABLE
     using policy_t = oneapi::dpl::experimental::auto_tune_policy<oneapi::dpl::experimental::sycl_backend>;
     std::vector<sycl::queue> u;
-    build_auto_tune_universe1(u);
+    build_auto_tune_universe(u);
 
     //If building the universe is not a success, return
     if (u.size() == 0 || u.size()==0)
@@ -495,7 +497,7 @@ main()
     constexpr bool just_call_submit = false;
     constexpr bool call_select_before_submit = true;
 
-    if (test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) ||
+    if (test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) /*||
         test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1) ||
         test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) ||
         test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1) ||
@@ -503,7 +505,7 @@ main()
         test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1) ||
         test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 0) ||
         test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1) ||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
+         test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
         test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1)||
         test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
         test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1) ||
@@ -519,7 +521,7 @@ main()
         test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0) ||
         test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1) ||
         test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1))
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1)*/)
     {
         std::cout << "FAIL\n";
         return 1;

From 0dda71219458770ebc612051d9dc181d0121cd45 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Thu, 21 Mar 2024 14:08:16 -0500
Subject: [PATCH 04/30] Added sycl profiling without host task

---
 .../dynamic_selection_impl/auto_tune_policy.h |  31 +-
 .../dynamic_selection_impl/sycl_backend.h     | 120 ++--
 .../dpl/internal/dynamic_selection_traits.h   |  25 +
 .../sycl/test_auto_tune_policy_sycl.pass.cpp  | 180 +++---
 ...est_auto_tune_policy_sycl_profile.pass.cpp | 538 ------------------
 test/support/test_dynamic_selection_utils.h   |   1 +
 6 files changed, 226 insertions(+), 669 deletions(-)
 delete mode 100644 test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 011b6dba0c0..215e50fc72d 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -120,21 +120,12 @@ class auto_tune_policy
             std::unique_lock<std::mutex> l(m_);
             auto index = r.index_;
             timing_t new_value = t;
-            /*if (time_by_index_.count(index) == 0)
-            {
-                // ignore the 1st timing to cover for JIT compilation
-                time_by_index_[index] = time_data_t{0, std::numeric_limits<timing_t>::max()};
-                std::cout<<"This timing is ignored\n";
-            }
-            else
-            {*/
-                auto& td = time_by_index_[index];
-                auto n = td.num_timings_;
-                new_value = (n * td.value_ + t) / (n + 1);
-                td.num_timings_ = n + 1;
-                td.value_ = new_value;
-
-            //}
+            auto& td = time_by_index_[index];
+            auto n = td.num_timings_;
+            new_value = (n * td.value_ + t) / (n + 1);
+            td.num_timings_ = n + 1;
+            td.value_ = new_value;
+
             if (new_value < best_timing_)
             {
                 best_timing_ = new_value;
@@ -153,8 +144,7 @@ class auto_tune_policy
       public:
         auto_tune_selection_type(const policy_t& p, resource_with_index_t r, std::shared_ptr<tuner_t> t)
             : policy_(p), resource_(r), tuner_(::std::move(t))
-        {
-        }
+        {}
 
         auto
         unwrap()
@@ -171,7 +161,6 @@ class auto_tune_policy
         void
         report(const execution_info::task_time_t&, const typename execution_info::task_time_t::value_type& v) const
         {
-            std::cout<<"Autotune report : "<<v<<"\n";
             tuner_->add_new_timing(resource_, v);
         }
     };
@@ -224,16 +213,16 @@ class auto_tune_policy
             auto k = make_task_key(std::forward<Function>(f), std::forward<Args>(args)...);
             auto t = state_->tuner_by_key_[k];
             auto index = t->get_resource_to_profile();
-            //std::cout<<"Device used : "<<index<<"\n";
             if (index == use_best_resource)
             {
-                std::cout<<"Testing Device used : "<<t->best_resource_.index_<<"\n";
                 return selection_type{*this, t->best_resource_, t};
             }
             else
             {
+                if constexpr(has_lazy_report<Backend>::value){
+                        backend_->lazy_report();
+                }
                 auto r = state_->resources_with_index_[index];
-                std::cout<<"Trial Device used : "<<index<<"\n";
                 return selection_type{*this, r, t};
             }
         }
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 48b54ddda43..279696cddc3 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -37,24 +37,66 @@ class sycl_backend
     using resource_container_t = std::vector<execution_resource_t>;
 
   private:
-    class async_waiter
+    class storage_base{
+        public:
+            virtual void wait() = 0;
+            virtual void report() = 0;
+            virtual bool is_complete() = 0;
+    };
+
+    std::vector<storage_base*> storage_arr;
+
+    template<typename T>
+    void addStorage(T *t){
+        storage_arr.push_back(t);
+    }
+
+    template<typename Selection>
+    class async_waiter : public storage_base
     {
         sycl::event e_;
+        Selection* s;
+        std::optional<std::chrono::steady_clock::time_point> timing;
 
       public:
-        async_waiter(sycl::event e) : e_(e) {}
+        async_waiter(sycl::event e) : e_(e){}
+        async_waiter(sycl::event e, Selection* selection, std::optional<std::chrono::steady_clock::time_point> t=std::nullopt) : e_(e), s(selection), timing(t) {}
         sycl::event
         unwrap()
         {
             return e_;
         }
+
         void
-        wait()
+        wait() override
         {
             e_.wait();
         }
+
+        void
+        report() override{
+            if constexpr (report_value_v<Selection, execution_info::task_time_t>){
+                if (!timing.has_value())
+                {
+                    cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
+                    cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
+                    s->report(execution_info::task_time, time_end-time_start);
+                }else{
+                    auto t = timing.value();
+                    s->report(execution_info::task_time, (std::chrono::steady_clock::now() - t).count());
+                }
+            }
+
+        }
+
+        bool
+        is_complete() override{
+            return e_.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete;
+        }
+
     };
 
+
     class submission_group
     {
         resource_container_t resources_;
@@ -93,7 +135,6 @@ class sycl_backend
             global_rank_.push_back(e);
         }
         sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
-        number_of_resources=global_rank_.size();
     }
 
     template <typename SelectionHandle, typename Function, typename... Args>
@@ -118,52 +159,32 @@ class sycl_backend
                 }
             }
             auto e1 = f(q, std::forward<Args>(args)...);
-        break;
-            auto e2 = q.submit([=](sycl::handler& h) {
-                h.depends_on(e1);
-                h.host_task([=]() {
-                    if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)
-                    {
+            if constexpr(report_info_v<SelectionHandle, execution_info::task_completion_t>){
+                auto e2 = q.submit([=](sycl::handler& h){
+                    h.depends_on(e1);
+                    h.host_task([=](){
                         s.report(execution_info::task_completion);
-                    }
-                    if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
-                    {
-                        if(number_of_resources==0){
-                            if (use_event_profiling)
-                            {
-                                try{
-                                    cl_ulong time_start = e1.template get_profiling_info<sycl::info::event_profiling::command_start>();
-                                //std::cout<<" Time start : "<<time_start<<"\n";
-                                    cl_ulong time_end = e1.template get_profiling_info<sycl::info::event_profiling::command_end>();
-
-                                //std::cout<<"Time end : "<<time_end<<"\n";
-                                //std::cout<<"Total time : "<<time_end-time_start<<"\n";
-                                s.report(execution_info::task_time, time_end - time_start);
-                                }catch(std::exception &e){
-                                    std::cout<<e.what();
-                                    break;
-                                }
-                            }
-                            else
-                            {
-
-                                s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
-                            }
-                        }
-                        else{
-                            std::cout<<"Timing ignored\n";
-                            number_of_resources--;
-                        }
-
-                    }
-                    std::cout<<"Exit\n";
+                    });
                 });
-            });
-            return async_waiter{e2};
+                return async_waiter{e2, new SelectionHandle(s)};
+            }
+            if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
+                if (use_event_profiling)
+                {
+                    auto waiter = async_waiter{e1, new SelectionHandle(s)};
+                    addStorage(new async_waiter(waiter));
+                    return waiter;
+                }
+                else{
+                    auto waiter = async_waiter{e1, new SelectionHandle(s), t0};
+                    addStorage(new async_waiter(waiter));
+                    return waiter;
+                }
+            }
         }
         else
         {
-            return async_waiter{f(unwrap(s), std::forward<Args>(args)...)};
+            return async_waiter{f(unwrap(s), std::forward<Args>(args)...), new SelectionHandle(s)};
         }
     }
 
@@ -179,6 +200,15 @@ class sycl_backend
         return global_rank_;
     }
 
+    void lazy_report(){
+        int size = storage_arr.size();
+        for(auto i = storage_arr.begin(); i!=storage_arr.begin()+size; i++){
+            if((*i)->is_complete()){
+                (*i)->report();
+                storage_arr.erase(i);
+            }
+        }
+    }
   private:
     std::atomic<int> number_of_resources;;
     resource_container_t global_rank_;
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index c1955a86c42..f61a0e29461 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -121,6 +121,18 @@ struct has_report_value : decltype(has_report_value_impl<S, Info>(0))
 {
 };
 
+template <typename Backend>
+auto
+has_lazy_report_impl(...) -> std::false_type;
+
+template <typename Backend>
+auto
+has_lazy_report_impl() -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
+
+template <typename Backend>
+struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
+{
+};
 } //namespace internal
 
 struct deferred_initialization_t
@@ -299,6 +311,19 @@ struct report_value
 template <typename S, typename Info>
 inline constexpr bool report_value_v = report_value<S, Info>::value;
 
+template <typename T>
+auto
+has_lazy_report_impl(...) -> std::false_type;
+
+template <typename T>
+auto
+has_lazy_report_impl(int) -> decltype(std::declval<T>().lazy_report(), std::true_type{});
+
+template <typename T>
+struct has_lazy_report : decltype(has_lazy_report_impl<T>(0))
+{
+};
+
 } // namespace experimental
 } // namespace dpl
 } // namespace oneapi
diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
index 85bfd27f931..56d4ebae1b7 100644
--- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
+++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
@@ -108,11 +108,13 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                     }
                 }
                 ecount += i;
-                if (*j == 0) 
+                if (*j == 0)
                 {
-                     return sycl::event{};
-                } 
-                else 
+                     return q.submit([=](sycl::handler& h){
+                        h.single_task([](){});
+                     });
+                }
+                else
                 {
                     return q.submit([=](sycl::handler& h) {
                         h.parallel_for<TestUtils::unique_kernel_name<
@@ -153,13 +155,17 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                         }
                     }
                     ecount += i;
-                    if (*j == 0) 
+                    if (*j == 0)
                     {
-                         return sycl::event{};
-                    } 
-                    else 
+                         auto x = q.submit([=](sycl::handler& h){
+                            h.single_task([](){});
+                         });
+                         return x;
+
+                    }
+                    else
                     {
-                        return q.submit([=](sycl::handler& h) {
+                        auto x = q.submit([=](sycl::handler& h) {
                             h.parallel_for<TestUtils::unique_kernel_name<
                                 class tune2, TestUtils::uniq_kernel_index<sycl::usm::alloc::shared>()>>(
                                 1000000, [=](sycl::id<1> idx) {
@@ -169,6 +175,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                                     }
                                 });
                         });
+                        return x;
                     }
                 });
             oneapi::dpl::experimental::wait(s);
@@ -250,11 +257,13 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
                     }
                 }
                 ecount += i;
-                if (*j == 0) 
+                if (*j == 0)
                 {
-                     return sycl::event{};
-                } 
-                else 
+                     return q.submit([=](sycl::handler& h){
+                        h.single_task([](){});
+                     });
+                }
+                else
                 {
                     return q.submit([=](sycl::handler& h) {
                         h.parallel_for<TestUtils::unique_kernel_name<
@@ -295,11 +304,13 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
                         }
                     }
                     ecount += i;
-                    if (*j == 0) 
+                    if (*j == 0)
                     {
-                         return sycl::event{};
-                    } 
-                    else 
+                         return q.submit([=](sycl::handler& h){
+                            h.single_task([](){});
+                         });
+                    }
+                    else
                     {
                         return q.submit([=](sycl::handler& h) {
                             h.parallel_for<TestUtils::unique_kernel_name<
@@ -392,11 +403,13 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                     }
                 }
                 ecount += i;
-                if (*j == 0) 
+                if (*j == 0)
                 {
-                     return sycl::event{};
-                } 
-                else 
+                     return q.submit([=](sycl::handler& h){
+                        h.single_task([](){});
+                     });
+                }
+                else
                 {
                     return q.submit([=](sycl::handler& h) {
                         h.parallel_for<TestUtils::unique_kernel_name<
@@ -436,13 +449,16 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                         }
                     }
                     ecount += i;
-                    if (*j == 0) 
+                    if (*j == 0)
                     {
-                         return sycl::event{};
-                    } 
-                    else 
+                         auto x = q.submit([=](sycl::handler& h){
+                            h.single_task([](){});
+                         });
+                        return x;
+                    }
+                    else
                     {
-                        return q.submit([=](sycl::handler& h) {
+                        auto x =  q.submit([=](sycl::handler& h) {
                             h.parallel_for<TestUtils::unique_kernel_name<
                                 class tune6, TestUtils::uniq_kernel_index<sycl::usm::alloc::shared>()>>(
                                 1000000, [=](sycl::id<1> idx) {
@@ -452,6 +468,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                                     }
                                 });
                         });
+                        return x;
                     }
                 });
         }
@@ -479,14 +496,20 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
     return 0;
 }
 
+
+template<bool use_event_profiling=false>
 static inline void
 build_auto_tune_universe(std::vector<sycl::queue>& u)
 {
+    auto prop_list = sycl::property_list{};
+    if(use_event_profiling){
+        prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
+    }
+
     try
     {
         auto device_cpu1 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu1_queue(device_cpu1);
-        run_sycl_sanity_test(cpu1_queue);
+        sycl::queue cpu1_queue{device_cpu1, prop_list};
         u.push_back(cpu1_queue);
     }
     catch (const sycl::exception&)
@@ -496,8 +519,7 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
     try
     {
         auto device_cpu2 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu2_queue(device_cpu2);
-        run_sycl_sanity_test(cpu2_queue);
+        sycl::queue cpu2_queue{device_cpu2, prop_list};
         u.push_back(cpu2_queue);
     }
     catch (const sycl::exception&)
@@ -507,8 +529,7 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
     try
     {
         auto device_cpu3 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu3_queue(device_cpu3);
-        run_sycl_sanity_test(cpu3_queue);
+        sycl::queue cpu3_queue{device_cpu3, prop_list};
         u.push_back(cpu3_queue);
     }
     catch (const sycl::exception&)
@@ -518,8 +539,7 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
     try
     {
         auto device_cpu4 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu4_queue(device_cpu4);
-        run_sycl_sanity_test(cpu4_queue);
+        sycl::queue cpu4_queue{device_cpu4, prop_list};
         u.push_back(cpu4_queue);
     }
     catch (const sycl::exception&)
@@ -527,6 +547,7 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
         std::cout << "SKIPPED: Unable to run with cpu_selector\n";
     }
 }
+
 #endif
 
 int
@@ -534,49 +555,78 @@ main()
 {
 #if TEST_DYNAMIC_SELECTION_AVAILABLE
     using policy_t = oneapi::dpl::experimental::auto_tune_policy<oneapi::dpl::experimental::sycl_backend>;
-    std::vector<sycl::queue> u;
-    build_auto_tune_universe(u);
+    std::vector<sycl::queue> u1;
+    std::vector<sycl::queue> u2;
+    constexpr bool use_event_profiling = true;
+    build_auto_tune_universe(u1);
+    build_auto_tune_universe<use_event_profiling>(u2);
 
     //If building the universe is not a success, return
-    if (u.size() == 0)
+    if (u1.size() == 0)
         return 0;
 
-    auto f = [u](int i) {
+    auto f = [u1](int i) {
         if (i <= 8)
-            return u[(i - 1) % 4];
+            return u1[(i - 1) % 4];
         else
-            return u[0];
+            return u1[0];
     };
 
     constexpr bool just_call_submit = false;
     constexpr bool call_select_before_submit = true;
 
-    if (test_auto_initialization(u) || test_select<policy_t, decltype(u), const decltype(f)&, true>(u, f) ||
-        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 2) ||
-        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 3) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 2) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 3) ||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1) ||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 2) ||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 3)
+    if (test_auto_initialization(u1) ||  test_select<policy_t, decltype(u1), const decltype(f)&, true>(u1, f) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 0) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 1) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 2) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 3) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 0) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 1) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 2) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 3) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 0) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 1) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 2) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 3) ||
+        // now select then submits
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 0) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 1) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 2) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 3) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 0) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 1) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 2) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 3) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 0) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 1) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 2) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 3) ||
+        //Use event profiling
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 0) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 1) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 2) ||
+        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 3) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 0) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 1) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 2) ||
+        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 3) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 0) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 1) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 2) ||
+        test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 3) ||
         // now select then submits
-        || test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 2) ||
-        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 3) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 2) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 3) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 2) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 3))
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 0) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 1) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 2) ||
+        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 3) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 0) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 1) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 2) ||
+        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 3) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 0) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 1) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 2) ||
+        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 3))
     {
         std::cout << "FAIL\n";
         return 1;
diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp
deleted file mode 100644
index b8cc1f686f0..00000000000
--- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl_profile.pass.cpp
+++ /dev/null
@@ -1,538 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Copyright (C) 2023 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <iostream>
-#include <thread>
-#include "oneapi/dpl/dynamic_selection"
-#include "support/test_dynamic_selection_utils.h"
-#include "support/test_config.h"
-#if TEST_DYNAMIC_SELECTION_AVAILABLE
-#    include "support/sycl_sanity.h"
-
-template <bool call_select_before_submit, typename Policy, typename UniverseContainer>
-int
-test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
-{
-    using my_policy_t = Policy;
-
-    int j;
-    std::vector<double> v(1000000, 0.0);
-
-    my_policy_t p{u};
-    auto n_samples = u.size();
-
-    const int N = 10;
-    std::atomic<int> ecount = 0;
-    bool pass = true;
-
-    for (int i = 1; i <= N; ++i)
-    {
-        if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource)
-        {
-            j = 100;
-        }
-        else
-        {
-            j = 0;
-        }
-        const size_t bytes = 1000000 * sizeof(double);
-        if constexpr (call_select_before_submit)
-        {
-            auto f = [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
-                if (i <= 2 * n_samples)
-                {
-                    // we should be round-robining through the resources
-                    if (q != u[(i - 1) % n_samples])
-                    {
-                        std::cout << i << ": mismatch during rr phase\n" << std::flush;
-                        pass = false;
-                    }
-                }
-                else
-                {
-                    if (q != u[best_resource])
-                    {
-                        std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
-                        pass = false;
-                    }
-                }
-                ecount += i;
-                if (j == 0)
-                {
-                     return q.submit([&](sycl::handler& h)
-                     {
-                             h.single_task([=](){});
-                         });
-                }
-                else
-                {
-                    return q.submit([&](sycl::handler& h) {
-                        double *d_v = sycl::malloc_device<double>(1000000, q);
-                        q.memcpy(d_v, v.data(), bytes).wait();
-                        q.parallel_for(
-                            1000000, [=](sycl::id<1> idx) {
-                                for (int j0 = 0; j0 < j; ++j0)
-                                {
-                                    d_v[idx] += idx;
-                                }
-                            });
-                        q.memcpy(v.data(), d_v, bytes).wait();
-                    });
-                }
-            };
-            auto s = oneapi::dpl::experimental::select(p, f);
-            auto e = oneapi::dpl::experimental::submit(s, f);
-            oneapi::dpl::experimental::wait(e);
-        }
-        else
-        {
-            // it's ok to capture by reference since we are waiting on each call
-            auto s = oneapi::dpl::experimental::submit(
-                p, [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
-                    if (i <= 2 * n_samples)
-                    {
-                        // we should be round-robining through the resources
-                        if (q != u[(i - 1) % n_samples])
-                        {
-                            std::cout << i << ": mismatch during rr phase\n" << std::flush;
-                            pass = false;
-                        }
-                    }
-                    else
-                    {
-                        if (q != u[best_resource])
-                        {
-                            std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
-                            pass = false;
-                        }
-                    }
-                    ecount += i;
-                    if (j == 0)
-                    {
-                     std::cout<<"Device 0\n";
-                     return q.submit([&](sycl::handler& h){
-                             h.single_task([=](){});
-                         });
-                    }
-                    else
-                    {
-                        std::cout<<"Device 1\n";
-                        return q.submit([&](sycl::handler& h) {
-                            double *d_v = sycl::malloc_device<double>(1000000, q);
-                            q.memcpy(d_v, v.data(), bytes).wait();
-                            q.parallel_for(
-                                1000000, [=](sycl::id<1> idx) {
-                                    for (int j0 = 0; j0 < j; ++j0)
-                                    {
-                                        d_v[idx] += idx;
-                                    }
-                                });
-                            q.memcpy(v.data(), d_v, bytes).wait();
-                        });
-                    }
-                });
-            oneapi::dpl::experimental::wait(s);
-        }
-
-        int count = ecount.load();
-        if (count != i * (i + 1) / 2)
-        {
-            std::cout << "ERROR: scheduler did not execute all tasks exactly once\n";
-            return 1;
-        }
-    }
-    if (!pass)
-    {
-        std::cout << "ERROR: did not select expected resources\n";
-        return 1;
-    }
-    if constexpr (call_select_before_submit)
-    {
-        std::cout << "select then submit and wait on event: OK\n";
-    }
-    else
-    {
-        std::cout << "submit and wait on event: OK\n";
-    }
-    return 0;
-}
-
-template <bool call_select_before_submit, typename Policy, typename UniverseContainer>
-int
-test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
-{
-    using my_policy_t = Policy;
-
-    int j;
-    std::vector<double> v(1000000, 0.0);
-
-    my_policy_t p{u};
-    auto n_samples = u.size();
-
-    const int N = 10;
-    std::atomic<int> ecount = 0;
-    bool pass = true;
-
-    for (int i = 1; i <= N; ++i)
-    {
-        if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource)
-        {
-            j = 100;
-        }
-        else
-        {
-            j = 0;
-        }
-        const size_t bytes = 1000000 * sizeof(double);
-        if constexpr (call_select_before_submit)
-        {
-            auto f = [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
-                if (i <= 2 * n_samples)
-                {
-                    // we should be round-robining through the resources
-                    if (q != u[(i - 1) % n_samples])
-                    {
-                        std::cout << i << ": mismatch during rr phase\n" << std::flush;
-                        pass = false;
-                    }
-                }
-                else
-                {
-                    if (q != u[best_resource])
-                    {
-                        std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
-                        pass = false;
-                    }
-                }
-                ecount += i;
-                if (j == 0)
-                {
-                     return q.submit([=](sycl::handler& h){
-                            h.single_task([=](){});
-                         });
-                }
-                else
-                {
-                    return q.submit([&](sycl::handler& h) {
-                            double *d_v = sycl::malloc_device<double>(1000000, q);
-                            q.memcpy(d_v, v.data(), bytes).wait();
-                            h.parallel_for(
-                            1000000, [=](sycl::id<1> idx) {
-                                for (int j0 = 0; j0 < j; ++j0)
-                                {
-                                    d_v[idx] += idx;
-                                }
-                            });
-                            q.memcpy(v.data(), d_v, bytes).wait();
-                    });
-                }
-            };
-            auto s = oneapi::dpl::experimental::select(p, f);
-            auto e = oneapi::dpl::experimental::submit(s, f);
-            oneapi::dpl::experimental::wait(p.get_submission_group());
-        }
-        else
-        {
-            // it's ok to capture by reference since we are waiting on each call
-            auto s = oneapi::dpl::experimental::submit(
-                p, [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
-                    if (i <= 2 * n_samples)
-                    {
-                        // we should be round-robining through the resources
-                        if (q != u[(i - 1) % n_samples])
-                        {
-                            std::cout << i << ": mismatch during rr phase\n" << std::flush;
-                            pass = false;
-                        }
-                    }
-                    else
-                    {
-                        if (q != u[best_resource])
-                        {
-                            std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
-                            pass = false;
-                        }
-                    }
-                    ecount += i;
-                    if (j == 0)
-                    {
-                     return q.submit([=](sycl::handler& h){
-                             h.single_task([=](){});
-                         });
-                    }
-                    else
-                    {
-                        return q.submit([&](sycl::handler& h) {
-                            double *d_v = sycl::malloc_device<double>(1000000, q);
-                            q.memcpy(d_v, v.data(), bytes).wait();
-                            h.parallel_for(
-                                1000000, [=](sycl::id<1> idx) {
-                                    for (int j0 = 0; j0 < j; ++j0)
-                                    {
-                                        d_v[idx] += idx;
-                                    }
-                                });
-                            q.memcpy(v.data(), d_v, bytes).wait();
-                        });
-                    }
-                });
-            oneapi::dpl::experimental::wait(p.get_submission_group());
-        }
-
-        int count = ecount.load();
-        if (count != i * (i + 1) / 2)
-        {
-            std::cout << "ERROR: scheduler did not execute all tasks exactly once\n";
-            return 1;
-        }
-    }
-    if (!pass)
-    {
-        std::cout << "ERROR: did not select expected resources\n";
-        return 1;
-    }
-    if constexpr (call_select_before_submit)
-    {
-        std::cout << "select then submit and wait on group: OK\n";
-    }
-    else
-    {
-        std::cout << "submit and wait on group: OK\n";
-    }
-    return 0;
-}
-
-template <bool call_select_before_submit, typename Policy, typename UniverseContainer>
-int
-test_auto_submit_and_wait(UniverseContainer u, int best_resource)
-{
-    using my_policy_t = Policy;
-
-    // they are cpus so this is ok
-    int j;
-    std::vector<double> v(1000000, 0.0);
-
-    my_policy_t p{u};
-    auto n_samples = u.size();
-
-    const int N = 10;
-    std::atomic<int> ecount = 0;
-    bool pass = true;
-
-    for (int i = 1; i <= N; ++i)
-    {
-        if (i <= 2 * n_samples && (i - 1) % n_samples != best_resource)
-        {
-            j = 100;
-        }
-        else
-        {
-            j = 0;
-        }
-        const size_t bytes = 1000000 * sizeof(double);
-        if constexpr (call_select_before_submit)
-        {
-            auto f = [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
-                if (i <= 2 * n_samples)
-                {
-                    // we should be round-robining through the resources
-                    if (q != u[(i - 1) % n_samples])
-                    {
-                        std::cout << i << ": mismatch during rr phase\n" << std::flush;
-                        pass = false;
-                    }
-                }
-                else
-                {
-                    if (q != u[best_resource])
-                    {
-                        std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
-                        pass = false;
-                    }
-                }
-                ecount += i;
-                if (j == 0)
-                {
-                     return q.submit([=](sycl::handler& h){
-                             h.single_task([=](){});
-                         });
-                }
-                else
-                {
-                    return q.submit([&](sycl::handler& h) {
-                        double *d_v = sycl::malloc_device<double>(1000000, q);
-                        q.memcpy(d_v, v.data(), bytes).wait();
-                        h.parallel_for(
-                            1000000, [=](sycl::id<1> idx) {
-                                for (int j0 = 0; j0 < j; ++j0)
-                                {
-                                    d_v[idx] += idx;
-                                }
-                            });
-                            q.memcpy(v.data(), d_v, bytes).wait();
-                    });
-                }
-            };
-            auto s = oneapi::dpl::experimental::select(p, f);
-            oneapi::dpl::experimental::submit_and_wait(s, f);
-        }
-        else
-        {
-            // it's ok to capture by reference since we are waiting on each call
-            oneapi::dpl::experimental::submit_and_wait(
-                p, [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
-                    if (i <= 2 * n_samples)
-                    {
-                        // we should be round-robining through the resources
-                        if (q != u[(i - 1) % n_samples])
-                        {
-                            std::cout << i << ": mismatch during rr phase\n" << std::flush;
-                            pass = false;
-                        }
-                    }
-                    else
-                    {
-                        if (q != u[best_resource])
-                        {
-                            std::cout << i << ": mismatch during prod phase " << best_resource << "\n" << std::flush;
-                            pass = false;
-                        }
-                    }
-                    ecount += i;
-                    if (j == 0)
-                    {
-                     return q.submit([=](sycl::handler& h){
-                             h.single_task([=](){});
-                         });
-                    }
-                    else
-                    {
-                        return q.submit([&](sycl::handler& h) {
-                            double *d_v = sycl::malloc_device<double>(1000000, q);
-                            q.memcpy(d_v, v.data(), bytes).wait();
-                            h.parallel_for(
-                                1000000, [=](sycl::id<1> idx) {
-                                    for (int j0 = 0; j0 < j; ++j0)
-                                    {
-                                        d_v[idx] += idx;
-                                    }
-                                });
-                            q.memcpy(v.data(), d_v, bytes).wait();
-                        });
-                    }
-                });
-        }
-
-        int count = ecount.load();
-        if (count != i * (i + 1) / 2)
-        {
-            std::cout << "ERROR: scheduler did not execute all tasks exactly once\n";
-            return 1;
-        }
-    }
-    if (!pass)
-    {
-        std::cout << "ERROR: did not select expected resources\n";
-        return 1;
-    }
-    if constexpr (call_select_before_submit)
-    {
-        std::cout << "select then submit_and_wait: OK\n";
-    }
-    else
-    {
-        std::cout << "submit_and_wait: OK\n";
-    }
-    return 0;
-}
-
-static inline void
-build_auto_tune_universe(std::vector<sycl::queue>& u)
-{
-    auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
-    try
-    {
-        auto device_cpu = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu_queue{device_cpu, prop_list};
-        run_sycl_sanity_test(cpu_queue);
-        u.push_back(cpu_queue);
-    }
-    catch (const sycl::exception&)
-    {
-        std::cout << "SKIPPED: Unable to run with cpu_selector\n";
-    }
-    try
-    {
-        auto device_gpu = sycl::device(sycl::gpu_selector_v);
-        sycl::queue gpu_queue{device_gpu, prop_list};
-        run_sycl_sanity_test(gpu_queue);
-        u.push_back(gpu_queue);
-    }
-    catch (const sycl::exception&)
-    {
-        std::cout << "SKIPPED: Unable to run with gpu_selector\n";
-    }
-}
-
-#endif
-
-int
-main()
-{
-#if TEST_DYNAMIC_SELECTION_AVAILABLE
-    using policy_t = oneapi::dpl::experimental::auto_tune_policy<oneapi::dpl::experimental::sycl_backend>;
-    std::vector<sycl::queue> u;
-    build_auto_tune_universe(u);
-
-    //If building the universe is not a success, return
-    if (u.size() == 0 || u.size()==0)
-        return 0;
-
-    constexpr bool just_call_submit = false;
-    constexpr bool call_select_before_submit = true;
-
-    if (test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) /*||
-        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1) ||
-         test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1)||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0) ||
-        test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1) ||
-        // now select then submits
-        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0) ||
-        test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1)*/)
-    {
-        std::cout << "FAIL\n";
-        return 1;
-    }
-    else
-    {
-        std::cout << "PASS\n";
-        return 0;
-    }
-#else
-    std::cout << "SKIPPED\n";
-    return 0;
-#endif // TEST_DYNAMIC_SELECTION_AVAILABLE
-}
diff --git a/test/support/test_dynamic_selection_utils.h b/test/support/test_dynamic_selection_utils.h
index dded1a1362f..3e35cd2242b 100644
--- a/test/support/test_dynamic_selection_utils.h
+++ b/test/support/test_dynamic_selection_utils.h
@@ -97,6 +97,7 @@ test_select(UniverseContainer u, ResourceFunction&& f)
             return 1;
         }
     }
+
     if (!pass)
     {
         std::cout << "ERROR: did not select expected resources\n";

From a38327a1e5eb28b3e3204b7bd69a93d6e4e2482e Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 22 Mar 2024 10:42:22 -0500
Subject: [PATCH 05/30] Corrections in dynamic traits

---
 .../dynamic_selection_impl/auto_tune_policy.h |  6 ++---
 .../dynamic_selection_impl/sycl_backend.h     | 23 ++++++++-----------
 .../dpl/internal/dynamic_selection_traits.h   | 12 ++++++++--
 .../sycl/test_auto_tune_policy_sycl.pass.cpp  | 15 ++++--------
 test/support/test_dynamic_selection_utils.h   |  1 -
 5 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 215e50fc72d..eba8f7ec8ae 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -125,7 +125,6 @@ class auto_tune_policy
             new_value = (n * td.value_ + t) / (n + 1);
             td.num_timings_ = n + 1;
             td.value_ = new_value;
-
             if (new_value < best_timing_)
             {
                 best_timing_ = new_value;
@@ -144,7 +143,8 @@ class auto_tune_policy
       public:
         auto_tune_selection_type(const policy_t& p, resource_with_index_t r, std::shared_ptr<tuner_t> t)
             : policy_(p), resource_(r), tuner_(::std::move(t))
-        {}
+        {
+        }
 
         auto
         unwrap()
@@ -219,7 +219,7 @@ class auto_tune_policy
             }
             else
             {
-                if constexpr(has_lazy_report<Backend>::value){
+                if constexpr(lazy_report_v<Backend>){
                         backend_->lazy_report();
                 }
                 auto r = state_->resources_with_index_[index];
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 279696cddc3..74b1bf7e3ba 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -19,7 +19,6 @@
 #include <vector>
 #include <memory>
 #include <utility>
-#include <atomic>
 
 namespace oneapi
 {
@@ -37,22 +36,22 @@ class sycl_backend
     using resource_container_t = std::vector<execution_resource_t>;
 
   private:
-    class storage_base{
+    class async_waiter_base{
         public:
             virtual void wait() = 0;
             virtual void report() = 0;
             virtual bool is_complete() = 0;
     };
 
-    std::vector<storage_base*> storage_arr;
+    std::vector<async_waiter_base*> async_waiter_arr;
 
     template<typename T>
-    void addStorage(T *t){
-        storage_arr.push_back(t);
+    void add_waiter(T *t){
+        async_waiter_arr.push_back(t);
     }
 
     template<typename Selection>
-    class async_waiter : public storage_base
+    class async_waiter : public async_waiter_base
     {
         sycl::event e_;
         Selection* s;
@@ -123,7 +122,6 @@ class sycl_backend
     {
         initialize_default_resources();
         sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
-        number_of_resources=global_rank_.size();
     }
 
     template <typename NativeUniverseVector>
@@ -172,12 +170,12 @@ class sycl_backend
                 if (use_event_profiling)
                 {
                     auto waiter = async_waiter{e1, new SelectionHandle(s)};
-                    addStorage(new async_waiter(waiter));
+                    add_waiter(new async_waiter(waiter));
                     return waiter;
                 }
                 else{
                     auto waiter = async_waiter{e1, new SelectionHandle(s), t0};
-                    addStorage(new async_waiter(waiter));
+                    add_waiter(new async_waiter(waiter));
                     return waiter;
                 }
             }
@@ -201,16 +199,15 @@ class sycl_backend
     }
 
     void lazy_report(){
-        int size = storage_arr.size();
-        for(auto i = storage_arr.begin(); i!=storage_arr.begin()+size; i++){
+        int size = async_waiter_arr.size();
+        for(auto i = async_waiter_arr.begin(); i!=async_waiter_arr.begin()+size; i++){
             if((*i)->is_complete()){
                 (*i)->report();
-                storage_arr.erase(i);
+                async_waiter_arr.erase(i);
             }
         }
     }
   private:
-    std::atomic<int> number_of_resources;;
     resource_container_t global_rank_;
     std::unique_ptr<submission_group> sgroup_ptr_;
 
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index f61a0e29461..0a63861bb38 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -127,7 +127,7 @@ has_lazy_report_impl(...) -> std::false_type;
 
 template <typename Backend>
 auto
-has_lazy_report_impl() -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
+has_lazy_report_impl(int) -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
 
 template <typename Backend>
 struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
@@ -311,7 +311,7 @@ struct report_value
 template <typename S, typename Info>
 inline constexpr bool report_value_v = report_value<S, Info>::value;
 
-template <typename T>
+/*template <typename T>
 auto
 has_lazy_report_impl(...) -> std::false_type;
 
@@ -322,7 +322,15 @@ has_lazy_report_impl(int) -> decltype(std::declval<T>().lazy_report(), std::true
 template <typename T>
 struct has_lazy_report : decltype(has_lazy_report_impl<T>(0))
 {
+};*/
+
+template <typename S>
+struct lazy_report_value
+{
+    static constexpr bool value = internal::has_lazy_report<S>::value;
 };
+template <typename S>
+inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
 
 } // namespace experimental
 } // namespace dpl
diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
index 56d4ebae1b7..3f8a0ca606e 100644
--- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
+++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
@@ -157,15 +157,13 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                     ecount += i;
                     if (*j == 0)
                     {
-                         auto x = q.submit([=](sycl::handler& h){
+                         return q.submit([=](sycl::handler& h){
                             h.single_task([](){});
                          });
-                         return x;
-
                     }
                     else
                     {
-                        auto x = q.submit([=](sycl::handler& h) {
+                        return q.submit([=](sycl::handler& h) {
                             h.parallel_for<TestUtils::unique_kernel_name<
                                 class tune2, TestUtils::uniq_kernel_index<sycl::usm::alloc::shared>()>>(
                                 1000000, [=](sycl::id<1> idx) {
@@ -175,7 +173,6 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                                     }
                                 });
                         });
-                        return x;
                     }
                 });
             oneapi::dpl::experimental::wait(s);
@@ -451,14 +448,13 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                     ecount += i;
                     if (*j == 0)
                     {
-                         auto x = q.submit([=](sycl::handler& h){
+                         return q.submit([=](sycl::handler& h){
                             h.single_task([](){});
                          });
-                        return x;
                     }
                     else
                     {
-                        auto x =  q.submit([=](sycl::handler& h) {
+                        return q.submit([=](sycl::handler& h) {
                             h.parallel_for<TestUtils::unique_kernel_name<
                                 class tune6, TestUtils::uniq_kernel_index<sycl::usm::alloc::shared>()>>(
                                 1000000, [=](sycl::id<1> idx) {
@@ -468,7 +464,6 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                                     }
                                 });
                         });
-                        return x;
                     }
                 });
         }
@@ -562,7 +557,7 @@ main()
     build_auto_tune_universe<use_event_profiling>(u2);
 
     //If building the universe is not a success, return
-    if (u1.size() == 0)
+    if (u1.size() == 0 || u2.size()==0)
         return 0;
 
     auto f = [u1](int i) {
diff --git a/test/support/test_dynamic_selection_utils.h b/test/support/test_dynamic_selection_utils.h
index 3e35cd2242b..dded1a1362f 100644
--- a/test/support/test_dynamic_selection_utils.h
+++ b/test/support/test_dynamic_selection_utils.h
@@ -97,7 +97,6 @@ test_select(UniverseContainer u, ResourceFunction&& f)
             return 1;
         }
     }
-
     if (!pass)
     {
         std::cout << "ERROR: did not select expected resources\n";

From 70f993cbd03109d10b30cb6315b99e9ba56f4fd8 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 22 Mar 2024 10:59:15 -0500
Subject: [PATCH 06/30] Remove comments in dynamic_selection_traits

---
 .../oneapi/dpl/internal/dynamic_selection_traits.h  | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index 0a63861bb38..b5b220cd1e5 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -311,19 +311,6 @@ struct report_value
 template <typename S, typename Info>
 inline constexpr bool report_value_v = report_value<S, Info>::value;
 
-/*template <typename T>
-auto
-has_lazy_report_impl(...) -> std::false_type;
-
-template <typename T>
-auto
-has_lazy_report_impl(int) -> decltype(std::declval<T>().lazy_report(), std::true_type{});
-
-template <typename T>
-struct has_lazy_report : decltype(has_lazy_report_impl<T>(0))
-{
-};*/
-
 template <typename S>
 struct lazy_report_value
 {

From 69f5014511f88b97564e405ca112f87d180e8d56 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 22 Mar 2024 15:50:27 -0500
Subject: [PATCH 07/30] Adding back jit compolation restrictions

---
 .../dynamic_selection_impl/auto_tune_policy.h | 18 +++++++++----
 .../dynamic_selection_impl/sycl_backend.h     | 26 ++++++++-----------
 .../sycl/test_auto_tune_policy_sycl.pass.cpp  |  1 -
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 3bcb0c52772..5fddd9b8467 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -122,11 +122,19 @@ class auto_tune_policy
             std::lock_guard<std::mutex> l(m_);
             auto index = r.index_;
             timing_t new_value = t;
-            auto& td = time_by_index_[index];
-            auto n = td.num_timings_;
-            new_value = (n * td.value_ + t) / (n + 1);
-            td.num_timings_ = n + 1;
-            td.value_ = new_value;
+            if (time_by_index_.count(index) == 0)
+            {
+                // ignore the 1st timing to cover for JIT compilation
+                time_by_index_[index] = time_data_t{0, std::numeric_limits<timing_t>::max()};
+            }
+            else
+            {
+                auto& td = time_by_index_[index];
+                auto n = td.num_timings_;
+                new_value = (n * td.value_ + t) / (n + 1);
+                td.num_timings_ = n + 1;
+                td.value_ = new_value;
+            }
             if (new_value < best_timing_)
             {
                 best_timing_ = new_value;
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index b39b70e683a..a7e00f6e8cd 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -55,11 +55,9 @@ class sycl_backend
     {
         sycl::event e_;
         Selection* s;
-        std::optional<std::chrono::steady_clock::time_point> timing;
-
       public:
         async_waiter(sycl::event e) : e_(e){}
-        async_waiter(sycl::event e, Selection* selection, std::optional<std::chrono::steady_clock::time_point> t=std::nullopt) : e_(e), s(selection), timing(t) {}
+        async_waiter(sycl::event e, Selection* selection) : e_(e), s(selection) {}
 
         sycl::event
         unwrap()
@@ -76,15 +74,9 @@ class sycl_backend
         void
         report() override{
             if constexpr (report_value_v<Selection, execution_info::task_time_t>){
-                if (!timing.has_value())
-                {
-                    cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
-                    cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
-                    s->report(execution_info::task_time, time_end-time_start);
-                }else{
-                    auto t = timing.value();
-                    s->report(execution_info::task_time, (std::chrono::steady_clock::now() - t).count());
-                }
+                cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
+                cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
+                s->report(execution_info::task_time, time_end-time_start);
             }
 
         }
@@ -175,9 +167,13 @@ class sycl_backend
                     return waiter;
                 }
                 else{
-                    auto waiter = async_waiter{e1, new SelectionHandle(s), t0};
-                    add_waiter(new async_waiter(waiter));
-                    return waiter;
+                    auto e2 = q.submit([=](sycl::handler& h){
+                        h.depends_on(e1);
+                        h.host_task([=](){
+                            s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
+                        });
+                    });
+                    return async_waiter{e2, new SelectionHandle(s)};
                 }
             }
         }
diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
index bdb9bf74549..23702af4c02 100644
--- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
+++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
@@ -513,7 +513,6 @@ main()
     bool bProcessed = false;
 
 #if TEST_DYNAMIC_SELECTION_AVAILABLE
-    std::cout<<"HERE\n";
     using policy_t = oneapi::dpl::experimental::auto_tune_policy<oneapi::dpl::experimental::sycl_backend>;
     std::vector<sycl::queue> u1;
     std::vector<sycl::queue> u2;

From 757c4701cc62f05f81667899cb07e227a0cacc3d Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Wed, 27 Mar 2024 11:18:17 -0500
Subject: [PATCH 08/30] Adressing comments to add thread safety, better traits

---
 .../dynamic_selection_impl/auto_tune_policy.h |  6 +--
 .../dynamic_selection_impl/sycl_backend.h     | 40 ++++++++++++-------
 .../dpl/internal/dynamic_selection_traits.h   | 18 +++++----
 3 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 5fddd9b8467..8557f15fa3d 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -217,6 +217,9 @@ class auto_tune_policy
     select(Function&& f, Args&&... args)
     {
         static_assert(sizeof...(KeyArgs) == sizeof...(Args));
+        if constexpr(backend_traits::lazy_report_v<Backend>){
+                backend_->lazy_report();
+        }
         if (state_)
         {
             std::lock_guard<std::mutex> l(state_->m_);
@@ -229,9 +232,6 @@ class auto_tune_policy
             }
             else
             {
-                if constexpr(lazy_report_v<Backend>){
-                        backend_->lazy_report();
-                }
                 auto r = state_->resources_with_index_[index];
                 return selection_type{*this, r, t};
             }
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index a7e00f6e8cd..11a475ccbc6 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -43,12 +43,29 @@ class sycl_backend
             virtual bool is_complete() = 0;
     };
 
-    std::vector<async_waiter_base*> async_waiter_arr;
+    struct async_waiter_arr{
 
-    template<typename T>
-    void add_waiter(T *t){
-        async_waiter_arr.push_back(t);
-    }
+        std::mutex m_;
+        std::vector<async_waiter_base*> async_waiters;
+
+        template<typename T>
+        void add_waiter(T *t){
+            std::lock_guard<std::mutex> l(m_);
+            async_waiters.push_back(t);
+        }
+
+        void lazy_report(){
+            int size = async_waiters.size();
+            for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){
+                if((*i)->is_complete()){
+                    (*i)->report();
+                    async_waiters.erase(i);
+                }
+            }
+        }
+    };
+
+    async_waiter_arr async_waiter_arr;
 
     template<typename Selection>
     class async_waiter : public async_waiter_base
@@ -159,11 +176,11 @@ class sycl_backend
                 });
                 return async_waiter{e2, new SelectionHandle(s)};
             }
-            if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
+            else if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
                 if (use_event_profiling)
                 {
                     auto waiter = async_waiter{e1, new SelectionHandle(s)};
-                    add_waiter(new async_waiter(waiter));
+                    async_waiter_arr.add_waiter(new async_waiter(waiter));
                     return waiter;
                 }
                 else{
@@ -196,14 +213,9 @@ class sycl_backend
     }
 
     void lazy_report(){
-        int size = async_waiter_arr.size();
-        for(auto i = async_waiter_arr.begin(); i!=async_waiter_arr.begin()+size; i++){
-            if((*i)->is_complete()){
-                (*i)->report();
-                async_waiter_arr.erase(i);
-            }
-        }
+        async_waiter_arr.lazy_report();
     }
+
   private:
     resource_container_t global_rank_;
     std::unique_ptr<submission_group> sgroup_ptr_;
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index b5b220cd1e5..4d6117f9ef1 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -135,6 +135,16 @@ struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
 };
 } //namespace internal
 
+namespace backend_traits {
+    template <typename S>
+    struct lazy_report_value
+    {
+        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report<S>::value;
+    };
+    template <typename S>
+    inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
+} //namespace backend_traits
+
 struct deferred_initialization_t
 {
 };
@@ -311,14 +321,6 @@ struct report_value
 template <typename S, typename Info>
 inline constexpr bool report_value_v = report_value<S, Info>::value;
 
-template <typename S>
-struct lazy_report_value
-{
-    static constexpr bool value = internal::has_lazy_report<S>::value;
-};
-template <typename S>
-inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
-
 } // namespace experimental
 } // namespace dpl
 } // namespace oneapi

From 37a895a3c79d14dc8c6daf9890ea5fd0ef596258 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Wed, 27 Mar 2024 15:12:38 -0500
Subject: [PATCH 09/30] Added backend trait to check if profiling is enabled

---
 .../dynamic_selection_impl/auto_tune_policy.h     |  4 +++-
 .../dynamic_selection_impl/sycl_backend.h         |  4 ++++
 .../dpl/internal/dynamic_selection_traits.h       | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 8557f15fa3d..89e118feb0e 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -217,8 +217,10 @@ class auto_tune_policy
     select(Function&& f, Args&&... args)
     {
         static_assert(sizeof...(KeyArgs) == sizeof...(Args));
-        if constexpr(backend_traits::lazy_report_v<Backend>){
+        if constexpr(backend_traits::lazy_report_v<Backend> && backend_traits::enable_profiling_v<Backend>){
+            if(backend_->has_enable_profiling == true){
                 backend_->lazy_report();
+            }
         }
         if (state_)
         {
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 11a475ccbc6..77f1d013d31 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -35,6 +35,7 @@ class sycl_backend
     using execution_resource_t = resource_type;
     using resource_container_t = std::vector<execution_resource_t>;
 
+    bool has_enable_profiling = false;
   private:
     class async_waiter_base{
         public:
@@ -141,6 +142,9 @@ class sycl_backend
         for (auto e : v)
         {
             global_rank_.push_back(e);
+            if(e.template has_property<sycl::property::queue::enable_profiling>()){
+                has_enable_profiling = true;
+            }
         }
         sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
     }
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index 4d6117f9ef1..1274c1df301 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -133,6 +133,13 @@ template <typename Backend>
 struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
 {
 };
+
+template <typename Backend, typename U = bool>
+struct has_enable_profiling : std::false_type { };
+
+// Specialization for U = int
+template <typename Backend>
+struct has_enable_profiling <Backend, decltype((void) Backend::has_enable_profiling, true)> : std::true_type { };
 } //namespace internal
 
 namespace backend_traits {
@@ -143,6 +150,14 @@ namespace backend_traits {
     };
     template <typename S>
     inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
+
+    template <typename S>
+    struct enable_profiling_value
+    {
+        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling<S>::value;
+    };
+    template <typename S>
+    inline constexpr bool enable_profiling_v = enable_profiling_value<S>::value;
 } //namespace backend_traits
 
 struct deferred_initialization_t

From d9fe635f664f790d8a42a6c24325f68120da7ae7 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Thu, 4 Apr 2024 11:43:16 -0500
Subject: [PATCH 10/30] Backend traits, renaming and adding a lock to lazy
 report

---
 .../dynamic_selection_impl/auto_tune_policy.h |  1 +
 .../dynamic_selection_impl/backend_traits.h   | 66 +++++++++++++++++++
 .../dynamic_load_policy.h                     |  5 ++
 .../dynamic_selection_impl/sycl_backend.h     | 53 +++++++--------
 .../dpl/internal/dynamic_selection_traits.h   | 37 -----------
 5 files changed, 99 insertions(+), 63 deletions(-)
 create mode 100644 include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 89e118feb0e..f551ecb77dc 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -22,6 +22,7 @@
 #include <tuple>
 #include <unordered_map>
 #include "oneapi/dpl/internal/dynamic_selection_traits.h"
+#include "oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h"
 #if _DS_BACKEND_SYCL != 0
 #    include "oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h"
 #endif
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
new file mode 100644
index 00000000000..91181ea2dd8
--- /dev/null
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
@@ -0,0 +1,66 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) 2023 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ONEDPL_INTERNAL_BACKEND_TRAITS_H
+#define _ONEDPL_INTERNAL_BACKEND_TRAITS_H
+
+#include "oneapi/dpl/internal/dynamic_selection_traits.h"
+
+namespace oneapi
+{
+namespace dpl
+{
+namespace experimental
+{
+namespace internal
+{
+    template <typename Backend>
+    auto
+    has_lazy_report_impl(...) -> std::false_type;
+
+    template <typename Backend>
+    auto
+    has_lazy_report_impl(int) -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
+
+    template <typename Backend>
+    struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
+    {
+    };
+
+    template <typename Backend, typename U = bool>
+    struct has_enable_profiling : std::false_type { };
+
+    // Specialization for U = int
+    template <typename Backend>
+    struct has_enable_profiling <Backend, decltype((void) Backend::has_enable_profiling, true)> : std::true_type { };
+}//namespace internal
+
+namespace backend_traits {
+    template <typename S>
+    struct lazy_report_value
+    {
+        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report<S>::value;
+    };
+    template <typename S>
+    inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
+
+    template <typename S>
+    struct enable_profiling_value
+    {
+        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling<S>::value;
+    };
+    template <typename S>
+    inline constexpr bool enable_profiling_v = enable_profiling_value<S>::value;
+} //namespace backend_traits
+
+} // namespace experimental
+} // namespace dpl
+} // namespace oneapi
+
+#endif /*_ONEDPL_INTERNAL_BACKEND_TRAITS_H*/
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
index d865b9b7fba..0886facb02b 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
@@ -150,6 +150,11 @@ struct dynamic_load_policy
     selection_type
     select(Args&&...)
     {
+        if constexpr(backend_traits::lazy_report_v<Backend> && backend_traits::enable_profiling_v<Backend>){
+            if(backend_->has_enable_profiling == true){
+                backend_->lazy_report();
+            }
+        }
         if (state_)
         {
             std::lock_guard<std::mutex> l(state_->m_);
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 77f1d013d31..88aa63ea231 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -44,30 +44,6 @@ class sycl_backend
             virtual bool is_complete() = 0;
     };
 
-    struct async_waiter_arr{
-
-        std::mutex m_;
-        std::vector<async_waiter_base*> async_waiters;
-
-        template<typename T>
-        void add_waiter(T *t){
-            std::lock_guard<std::mutex> l(m_);
-            async_waiters.push_back(t);
-        }
-
-        void lazy_report(){
-            int size = async_waiters.size();
-            for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){
-                if((*i)->is_complete()){
-                    (*i)->report();
-                    async_waiters.erase(i);
-                }
-            }
-        }
-    };
-
-    async_waiter_arr async_waiter_arr;
-
     template<typename Selection>
     class async_waiter : public async_waiter_base
     {
@@ -106,6 +82,31 @@ class sycl_backend
 
     };
 
+    struct async_waiter_list_t{
+
+        std::mutex m_;
+        std::vector<async_waiter_base*> async_waiters;
+
+        template<typename T>
+        void add_waiter(T *t){
+            std::lock_guard<std::mutex> l(m_);
+            async_waiters.push_back(t);
+        }
+
+        void lazy_report(){
+            std::lock_guard<std::mutex> l(m_);
+            int size = async_waiters.size();
+            for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){
+                if((*i)->is_complete()){
+                    (*i)->report();
+                    async_waiters.erase(i);
+                }
+            }
+        }
+    };
+
+    async_waiter_list_t async_waiter_list;
+
 
     class submission_group
     {
@@ -184,7 +185,7 @@ class sycl_backend
                 if (use_event_profiling)
                 {
                     auto waiter = async_waiter{e1, new SelectionHandle(s)};
-                    async_waiter_arr.add_waiter(new async_waiter(waiter));
+                    async_waiter_list.add_waiter(new async_waiter(waiter));
                     return waiter;
                 }
                 else{
@@ -217,7 +218,7 @@ class sycl_backend
     }
 
     void lazy_report(){
-        async_waiter_arr.lazy_report();
+        async_waiter_list.lazy_report();
     }
 
   private:
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index 1274c1df301..c1955a86c42 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -121,45 +121,8 @@ struct has_report_value : decltype(has_report_value_impl<S, Info>(0))
 {
 };
 
-template <typename Backend>
-auto
-has_lazy_report_impl(...) -> std::false_type;
-
-template <typename Backend>
-auto
-has_lazy_report_impl(int) -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
-
-template <typename Backend>
-struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
-{
-};
-
-template <typename Backend, typename U = bool>
-struct has_enable_profiling : std::false_type { };
-
-// Specialization for U = int
-template <typename Backend>
-struct has_enable_profiling <Backend, decltype((void) Backend::has_enable_profiling, true)> : std::true_type { };
 } //namespace internal
 
-namespace backend_traits {
-    template <typename S>
-    struct lazy_report_value
-    {
-        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report<S>::value;
-    };
-    template <typename S>
-    inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
-
-    template <typename S>
-    struct enable_profiling_value
-    {
-        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling<S>::value;
-    };
-    template <typename S>
-    inline constexpr bool enable_profiling_v = enable_profiling_value<S>::value;
-} //namespace backend_traits
-
 struct deferred_initialization_t
 {
 };

From 95ee3acb42e816055601c3b7ff5d68f5e07e2b15 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Tue, 9 Apr 2024 16:26:51 -0500
Subject: [PATCH 11/30] Fixed memory leaks

---
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h   | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 88aa63ea231..66e5b77ac59 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -42,6 +42,7 @@ class sycl_backend
             virtual void wait() = 0;
             virtual void report() = 0;
             virtual bool is_complete() = 0;
+            virtual ~async_waiter_base(){}
     };
 
     template<typename Selection>
@@ -85,20 +86,20 @@ class sycl_backend
     struct async_waiter_list_t{
 
         std::mutex m_;
-        std::vector<async_waiter_base*> async_waiters;
+        std::vector<std::unique_ptr<async_waiter_base>> async_waiters;
 
         template<typename T>
         void add_waiter(T *t){
             std::lock_guard<std::mutex> l(m_);
-            async_waiters.push_back(t);
+            async_waiters.push_back(std::unique_ptr<T>(t));
         }
 
         void lazy_report(){
             std::lock_guard<std::mutex> l(m_);
             int size = async_waiters.size();
             for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){
-                if((*i)->is_complete()){
-                    (*i)->report();
+                if(i->get()->is_complete()){
+                    i->get()->report();
                     async_waiters.erase(i);
                 }
             }

From cd5b3182e59504f270604113d1e876e6fd35e797 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Mon, 15 Apr 2024 10:35:58 -0500
Subject: [PATCH 12/30] No nullptr for selection handle in async waiter
 constructor

---
 .../dynamic_selection_impl/sycl_backend.h     | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 66e5b77ac59..c88238459df 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -42,18 +42,24 @@ class sycl_backend
             virtual void wait() = 0;
             virtual void report() = 0;
             virtual bool is_complete() = 0;
-            virtual ~async_waiter_base(){}
+            virtual ~async_waiter_base() = default;
     };
 
     template<typename Selection>
     class async_waiter : public async_waiter_base
     {
         sycl::event e_;
-        Selection* s;
+        std::shared_ptr<Selection> s;
       public:
-        async_waiter(sycl::event e) : e_(e){}
-        async_waiter(sycl::event e, Selection* selection) : e_(e), s(selection) {}
-
+        async_waiter(sycl::event e) : e_(e) {}
+        async_waiter(sycl::event e, std::shared_ptr<Selection> selection) : e_(e), s(selection) {}
+
+        async_waiter(async_waiter &w) : e_(w.e_), s(w.s) {}
+        async_waiter& operator=(async_waiter &w){
+            s = w.s;
+            e_ = w.e_;
+            return *this;
+        }
         sycl::event
         unwrap()
         {
@@ -86,7 +92,7 @@ class sycl_backend
     struct async_waiter_list_t{
 
         std::mutex m_;
-        std::vector<std::unique_ptr<async_waiter_base>> async_waiters;
+        std::vector<std::shared_ptr<async_waiter_base>> async_waiters;
 
         template<typename T>
         void add_waiter(T *t){
@@ -180,12 +186,12 @@ class sycl_backend
                         s.report(execution_info::task_completion);
                     });
                 });
-                return async_waiter{e2, new SelectionHandle(s)};
+                return async_waiter{e2, std::make_shared<SelectionHandle>(s)};
             }
             else if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
                 if (use_event_profiling)
                 {
-                    auto waiter = async_waiter{e1, new SelectionHandle(s)};
+                    auto waiter = async_waiter{e1,std::make_shared<SelectionHandle>(s)};
                     async_waiter_list.add_waiter(new async_waiter(waiter));
                     return waiter;
                 }
@@ -196,13 +202,13 @@ class sycl_backend
                             s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
                         });
                     });
-                    return async_waiter{e2, new SelectionHandle(s)};
+                    return async_waiter{e2, std::make_shared<SelectionHandle>(s)};
                 }
             }
         }
         else
         {
-            return async_waiter{f(unwrap(s), std::forward<Args>(args)...), new SelectionHandle(s)};
+            return async_waiter{f(unwrap(s), std::forward<Args>(args)...), std::make_shared<SelectionHandle>(s)};
         }
     }
 

From e58f2640f09a9a3411ab6cf33a0b6926bcd268a0 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Mon, 15 Apr 2024 11:44:00 -0500
Subject: [PATCH 13/30] Added an erase and remove_if

---
 .../dynamic_selection_impl/sycl_backend.h         | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index c88238459df..030ce47bfdc 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -19,6 +19,7 @@
 #include <vector>
 #include <memory>
 #include <utility>
+#include <algorithm>
 
 namespace oneapi
 {
@@ -102,13 +103,13 @@ class sycl_backend
 
         void lazy_report(){
             std::lock_guard<std::mutex> l(m_);
-            int size = async_waiters.size();
-            for(auto i = async_waiters.begin(); i!=async_waiters.begin()+size; i++){
-                if(i->get()->is_complete()){
-                    i->get()->report();
-                    async_waiters.erase(i);
-                }
-            }
+            async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::shared_ptr<async_waiter_base>& async_waiter){
+                    if(async_waiter->is_complete()){
+                        async_waiter->report();
+                        return true;
+                    }
+                    return false;
+                }), async_waiters.end());
         }
     };
 

From e134770e1db5e191e8e3dea016a65a72c382397f Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Wed, 24 Apr 2024 16:10:31 -0500
Subject: [PATCH 14/30] Adressing comments for profiling report, Adding
 profiling to default sycl backend constructor

---
 .../dynamic_selection_impl/auto_tune_policy.h | 18 ++---
 .../dynamic_selection_impl/backend_traits.h   | 17 +----
 .../dynamic_load_policy.h                     |  6 +-
 .../dynamic_selection_impl/sycl_backend.h     | 68 ++++++++++++-------
 4 files changed, 58 insertions(+), 51 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index f551ecb77dc..cc74f845075 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -16,6 +16,7 @@
 #include <mutex>
 #include <utility>
 #include <chrono>
+#include <ratio>
 #include <limits>
 #include <vector>
 #include <type_traits>
@@ -48,6 +49,9 @@ class auto_tune_policy
     using size_type = typename std::vector<typename Backend::resource_type>::size_type;
     using timing_t = uint64_t;
 
+    using report_clock_type = std::chrono::steady_clock;
+    using report_duration = std::chrono::duration<double, std::milli>;
+
     static constexpr timing_t never_resample = 0;
     static constexpr size_type use_best_resource = ~size_type(0);
 
@@ -67,7 +71,7 @@ class auto_tune_policy
     {
         std::mutex m_;
 
-        std::chrono::steady_clock::time_point t0_;
+        report_clock_type::time_point t0_;
 
         timing_t best_timing_ = std::numeric_limits<timing_t>::max();
         resource_with_index_t best_resource_;
@@ -81,7 +85,7 @@ class auto_tune_policy
         timing_t resample_time_ = 0.0;
 
         tuner_t(resource_with_index_t br, size_type resources_size, timing_t rt)
-            : t0_(std::chrono::steady_clock::now()), best_resource_(br), max_resource_to_profile_(resources_size),
+            : t0_(report_clock_type::now()), best_resource_(br), max_resource_to_profile_(resources_size),
               resample_time_(rt)
         {
         }
@@ -101,8 +105,8 @@ class auto_tune_policy
             }
             else
             {
-                auto now = std::chrono::steady_clock::now();
-                auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(now - t0_).count();
+                auto now = report_clock_type::now();
+                auto ms = report_duration(now - t0_).count();
                 if (ms < resample_time_)
                 {
                     return use_best_resource;
@@ -218,10 +222,8 @@ class auto_tune_policy
     select(Function&& f, Args&&... args)
     {
         static_assert(sizeof...(KeyArgs) == sizeof...(Args));
-        if constexpr(backend_traits::lazy_report_v<Backend> && backend_traits::enable_profiling_v<Backend>){
-            if(backend_->has_enable_profiling == true){
-                backend_->lazy_report();
-            }
+        if constexpr(backend_traits::lazy_report_v<Backend>){
+            backend_->lazy_report();
         }
         if (state_)
         {
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
index 91181ea2dd8..b50a5b3a27c 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
@@ -10,7 +10,9 @@
 #ifndef _ONEDPL_INTERNAL_BACKEND_TRAITS_H
 #define _ONEDPL_INTERNAL_BACKEND_TRAITS_H
 
-#include "oneapi/dpl/internal/dynamic_selection_traits.h"
+#include <utility>
+#include <cstdint>
+#include <type_traits>
 
 namespace oneapi
 {
@@ -33,12 +35,6 @@ namespace internal
     {
     };
 
-    template <typename Backend, typename U = bool>
-    struct has_enable_profiling : std::false_type { };
-
-    // Specialization for U = int
-    template <typename Backend>
-    struct has_enable_profiling <Backend, decltype((void) Backend::has_enable_profiling, true)> : std::true_type { };
 }//namespace internal
 
 namespace backend_traits {
@@ -50,13 +46,6 @@ namespace backend_traits {
     template <typename S>
     inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
 
-    template <typename S>
-    struct enable_profiling_value
-    {
-        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_enable_profiling<S>::value;
-    };
-    template <typename S>
-    inline constexpr bool enable_profiling_v = enable_profiling_value<S>::value;
 } //namespace backend_traits
 
 } // namespace experimental
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
index 0886facb02b..9a7e60cd6fb 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
@@ -150,10 +150,8 @@ struct dynamic_load_policy
     selection_type
     select(Args&&...)
     {
-        if constexpr(backend_traits::lazy_report_v<Backend> && backend_traits::enable_profiling_v<Backend>){
-            if(backend_->has_enable_profiling == true){
-                backend_->lazy_report();
-            }
+        if constexpr(backend_traits::lazy_report_v<Backend>){
+            backend_->lazy_report();
         }
         if (state_)
         {
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 030ce47bfdc..d9750e23159 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -16,6 +16,7 @@
 #include "oneapi/dpl/internal/dynamic_selection_impl/scoring_policy_defs.h"
 
 #include <chrono>
+#include <ratio>
 #include <vector>
 #include <memory>
 #include <utility>
@@ -36,7 +37,10 @@ class sycl_backend
     using execution_resource_t = resource_type;
     using resource_container_t = std::vector<execution_resource_t>;
 
-    bool has_enable_profiling = false;
+    using report_clock_type = std::chrono::steady_clock;
+    using report_duration = std::chrono::duration<double, std::milli>;
+
+    static inline bool is_profiling_enabled = false;
   private:
     class async_waiter_base{
         public:
@@ -56,11 +60,6 @@ class sycl_backend
         async_waiter(sycl::event e, std::shared_ptr<Selection> selection) : e_(e), s(selection) {}
 
         async_waiter(async_waiter &w) : e_(w.e_), s(w.s) {}
-        async_waiter& operator=(async_waiter &w){
-            s = w.s;
-            e_ = w.e_;
-            return *this;
-        }
         sycl::event
         unwrap()
         {
@@ -78,9 +77,12 @@ class sycl_backend
             if constexpr (report_value_v<Selection, execution_info::task_time_t>){
                 cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
                 cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
-                s->report(execution_info::task_time, time_end-time_start);
+                if(s!=nullptr){
+                    s->report(execution_info::task_time, report_duration(time_end-time_start).count());
+                }
             }
 
+
         }
 
         bool
@@ -93,7 +95,7 @@ class sycl_backend
     struct async_waiter_list_t{
 
         std::mutex m_;
-        std::vector<std::shared_ptr<async_waiter_base>> async_waiters;
+        std::vector<std::unique_ptr<async_waiter_base>> async_waiters;
 
         template<typename T>
         void add_waiter(T *t){
@@ -102,14 +104,16 @@ class sycl_backend
         }
 
         void lazy_report(){
-            std::lock_guard<std::mutex> l(m_);
-            async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::shared_ptr<async_waiter_base>& async_waiter){
-                    if(async_waiter->is_complete()){
-                        async_waiter->report();
-                        return true;
-                    }
-                    return false;
-                }), async_waiters.end());
+            if(is_profiling_enabled){
+                std::lock_guard<std::mutex> l(m_);
+                async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::unique_ptr<async_waiter_base>& async_waiter){
+                        if(async_waiter->is_complete()){
+                            async_waiter->report();
+                            return true;
+                        }
+                        return false;
+                    }), async_waiters.end());
+            }
         }
     };
 
@@ -147,14 +151,16 @@ class sycl_backend
     template <typename NativeUniverseVector>
     sycl_backend(const NativeUniverseVector& v)
     {
+        bool profiling = true;
         global_rank_.reserve(v.size());
         for (auto e : v)
         {
             global_rank_.push_back(e);
-            if(e.template has_property<sycl::property::queue::enable_profiling>()){
-                has_enable_profiling = true;
+            if(!e.template has_property<sycl::property::queue::enable_profiling>()){
+                profiling = false;
             }
         }
+        is_profiling_enabled = profiling;
         sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
     }
 
@@ -170,13 +176,12 @@ class sycl_backend
         if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t> ||
                       report_value_v<SelectionHandle, execution_info::task_time_t>)
         {
-            std::chrono::steady_clock::time_point t0;
-            bool use_event_profiling = q.template has_property<sycl::property::queue::enable_profiling>();
+            report_clock_type::time_point t0;
             if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
             {
-                if (!use_event_profiling)
+                if (!is_profiling_enabled)
                 {
-                    t0 = std::chrono::steady_clock::now();
+                    t0 = report_clock_type::now();
                 }
             }
             auto e1 = f(q, std::forward<Args>(args)...);
@@ -189,8 +194,8 @@ class sycl_backend
                 });
                 return async_waiter{e2, std::make_shared<SelectionHandle>(s)};
             }
-            else if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
-                if (use_event_profiling)
+            if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
+                if (is_profiling_enabled)
                 {
                     auto waiter = async_waiter{e1,std::make_shared<SelectionHandle>(s)};
                     async_waiter_list.add_waiter(new async_waiter(waiter));
@@ -200,7 +205,7 @@ class sycl_backend
                     auto e2 = q.submit([=](sycl::handler& h){
                         h.depends_on(e1);
                         h.host_task([=](){
-                            s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
+                            s.report(execution_info::task_time, (report_clock_type::now() - t0).count());
                         });
                     });
                     return async_waiter{e2, std::make_shared<SelectionHandle>(s)};
@@ -236,10 +241,23 @@ class sycl_backend
     void
     initialize_default_resources()
     {
+        bool profiling = true;
+        auto prop_list = sycl::property_list{};
         auto devices = sycl::device::get_devices();
         for (auto x : devices)
         {
             global_rank_.push_back(sycl::queue{x});
+            if(!x.has(sycl::aspect::queue_profiling)){
+                profiling = false;
+            }
+        }
+        is_profiling_enabled = profiling;
+        if(is_profiling_enabled){
+            prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
+        }
+        for (auto x : devices)
+        {
+            global_rank_.push_back(sycl::queue{x, prop_list});
         }
     }
 };

From 6ce3b5ea21a954bc10e4b61c7bb272b0d6bc0ffe Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Thu, 25 Apr 2024 11:16:17 -0500
Subject: [PATCH 15/30] Changes to std::chrono::duration

---
 .../internal/dynamic_selection_impl/auto_tune_policy.h    | 6 +++---
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h    | 8 +++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index cc74f845075..6688ffc6e02 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -50,7 +50,7 @@ class auto_tune_policy
     using timing_t = uint64_t;
 
     using report_clock_type = std::chrono::steady_clock;
-    using report_duration = std::chrono::duration<double, std::milli>;
+    using report_duration = std::chrono::milliseconds;
 
     static constexpr timing_t never_resample = 0;
     static constexpr size_type use_best_resource = ~size_type(0);
@@ -105,8 +105,8 @@ class auto_tune_policy
             }
             else
             {
-                auto now = report_clock_type::now();
-                auto ms = report_duration(now - t0_).count();
+                const auto now = report_clock_type::now();
+                const auto ms = std::chrono::duration_cast<report_duration>(now - t0_).count();
                 if (ms < resample_time_)
                 {
                     return use_best_resource;
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index d9750e23159..deaf31a03cf 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -38,7 +38,7 @@ class sycl_backend
     using resource_container_t = std::vector<execution_resource_t>;
 
     using report_clock_type = std::chrono::steady_clock;
-    using report_duration = std::chrono::duration<double, std::milli>;
+    using report_duration = std::chrono::milliseconds;
 
     static inline bool is_profiling_enabled = false;
   private:
@@ -78,7 +78,8 @@ class sycl_backend
                 cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
                 cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
                 if(s!=nullptr){
-                    s->report(execution_info::task_time, report_duration(time_end-time_start).count());
+                    const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start);
+                    s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(duration_in_ns).count());
                 }
             }
 
@@ -205,7 +206,8 @@ class sycl_backend
                     auto e2 = q.submit([=](sycl::handler& h){
                         h.depends_on(e1);
                         h.host_task([=](){
-                            s.report(execution_info::task_time, (report_clock_type::now() - t0).count());
+                            const auto tp_now = report_clock_type::now();
+                            s.report(execution_info::task_time, std::chrono::duration_cast<report_duration>(tp_now - t0).count());
                         });
                     });
                     return async_waiter{e2, std::make_shared<SelectionHandle>(s)};

From a8ad96bb3a4f5c1156999f1c34e9d5cf7487ae5e Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Thu, 25 Apr 2024 14:43:39 -0500
Subject: [PATCH 16/30] Addressed comments for sycl backend

---
 .../dynamic_selection_impl/sycl_backend.h     | 29 +++++++++----------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index deaf31a03cf..7473c8f8f79 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -44,7 +44,6 @@ class sycl_backend
   private:
     class async_waiter_base{
         public:
-            virtual void wait() = 0;
             virtual void report() = 0;
             virtual bool is_complete() = 0;
             virtual ~async_waiter_base() = default;
@@ -56,10 +55,9 @@ class sycl_backend
         sycl::event e_;
         std::shared_ptr<Selection> s;
       public:
-        async_waiter(sycl::event e) : e_(e) {}
+        async_waiter() = default;
         async_waiter(sycl::event e, std::shared_ptr<Selection> selection) : e_(e), s(selection) {}
 
-        async_waiter(async_waiter &w) : e_(w.e_), s(w.s) {}
         sycl::event
         unwrap()
         {
@@ -67,7 +65,7 @@ class sycl_backend
         }
 
         void
-        wait() override
+        wait()
         {
             e_.wait();
         }
@@ -75,9 +73,9 @@ class sycl_backend
         void
         report() override{
             if constexpr (report_value_v<Selection, execution_info::task_time_t>){
-                cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
-                cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
                 if(s!=nullptr){
+                    cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
+                    cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
                     const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start);
                     s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(duration_in_ns).count());
                 }
@@ -98,10 +96,9 @@ class sycl_backend
         std::mutex m_;
         std::vector<std::unique_ptr<async_waiter_base>> async_waiters;
 
-        template<typename T>
-        void add_waiter(T *t){
+        void add_waiter(async_waiter_base *t){
             std::lock_guard<std::mutex> l(m_);
-            async_waiters.push_back(std::unique_ptr<T>(t));
+            async_waiters.push_back(std::unique_ptr<async_waiter_base>(t));
         }
 
         void lazy_report(){
@@ -185,6 +182,7 @@ class sycl_backend
                     t0 = report_clock_type::now();
                 }
             }
+            async_waiter<SelectionHandle> waiter;
             auto e1 = f(q, std::forward<Args>(args)...);
             if constexpr(report_info_v<SelectionHandle, execution_info::task_completion_t>){
                 auto e2 = q.submit([=](sycl::handler& h){
@@ -193,14 +191,13 @@ class sycl_backend
                         s.report(execution_info::task_completion);
                     });
                 });
-                return async_waiter{e2, std::make_shared<SelectionHandle>(s)};
+                waiter =  async_waiter{e2, std::make_shared<SelectionHandle>(s)};
             }
             if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
                 if (is_profiling_enabled)
                 {
-                    auto waiter = async_waiter{e1,std::make_shared<SelectionHandle>(s)};
+                    waiter = async_waiter{e1,std::make_shared<SelectionHandle>(s)};
                     async_waiter_list.add_waiter(new async_waiter(waiter));
-                    return waiter;
                 }
                 else{
                     auto e2 = q.submit([=](sycl::handler& h){
@@ -210,9 +207,10 @@ class sycl_backend
                             s.report(execution_info::task_time, std::chrono::duration_cast<report_duration>(tp_now - t0).count());
                         });
                     });
-                    return async_waiter{e2, std::make_shared<SelectionHandle>(s)};
+                    waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
                 }
             }
+            return waiter;
         }
         else
         {
@@ -246,9 +244,8 @@ class sycl_backend
         bool profiling = true;
         auto prop_list = sycl::property_list{};
         auto devices = sycl::device::get_devices();
-        for (auto x : devices)
+        for (auto& x : devices)
         {
-            global_rank_.push_back(sycl::queue{x});
             if(!x.has(sycl::aspect::queue_profiling)){
                 profiling = false;
             }
@@ -257,7 +254,7 @@ class sycl_backend
         if(is_profiling_enabled){
             prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
         }
-        for (auto x : devices)
+        for (auto& x : devices)
         {
             global_rank_.push_back(sycl::queue{x, prop_list});
         }

From b70aeecdc933ec42c3fff526d06fb96b113806ca Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 26 Apr 2024 10:29:54 -0500
Subject: [PATCH 17/30] Addressed comments to sycl backend

---
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 7473c8f8f79..a053bdd5a7e 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -74,8 +74,8 @@ class sycl_backend
         report() override{
             if constexpr (report_value_v<Selection, execution_info::task_time_t>){
                 if(s!=nullptr){
-                    cl_ulong time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
-                    cl_ulong time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
+                    const auto time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
+                    const auto time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
                     const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start);
                     s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(duration_in_ns).count());
                 }
@@ -214,7 +214,7 @@ class sycl_backend
         }
         else
         {
-            return async_waiter{f(unwrap(s), std::forward<Args>(args)...), std::make_shared<SelectionHandle>(s)};
+            return async_waiter{f(q, std::forward<Args>(args)...), std::make_shared<SelectionHandle>(s)};
         }
     }
 

From 60710e2a909d18cb4c4771f577df25000698119d Mon Sep 17 00:00:00 2001
From: Sergey Kopienko <sergey.kopienko@intel.com>
Date: Fri, 26 Apr 2024 17:33:29 +0200
Subject: [PATCH 18/30] Fix USM memory leaks and create unique task names
 (#1537)

* test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp - fix USM shared memory leaks

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

* test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp - create unique task names for h.single_task([](){});

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

---------

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>
---
 .../sycl/test_auto_tune_policy_sycl.pass.cpp  | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
index 23702af4c02..09809a8492f 100644
--- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
+++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
@@ -14,6 +14,8 @@
 #include <thread>
 #include "support/test_dynamic_selection_utils.h"
 #include "support/utils.h"
+#include "support/sycl_alloc_utils.h"
+
 #if TEST_DYNAMIC_SELECTION_AVAILABLE
 
 int
@@ -53,8 +55,11 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
     using my_policy_t = Policy;
 
     // they are cpus so this is ok
-    double* v = sycl::malloc_shared<double>(1000000, u[0]);
-    int* j = sycl::malloc_shared<int>(1, u[0]);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, double> dt_helper_v(u[0], 1000000);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, int> dt_helper_j(u[0], 1);
+
+    double* v = dt_helper_v.get_data();
+    int* j = dt_helper_j.get_data();
 
     my_policy_t p{u};
     auto n_samples = u.size();
@@ -102,7 +107,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                 if (*j == 0)
                 {
                      return q.submit([=](sycl::handler& h){
-                        h.single_task([](){});
+                        h.single_task<class SingleTask1>([](){});
                      });
                 }
                 else
@@ -148,7 +153,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                     if (*j == 0)
                     {
                          return q.submit([=](sycl::handler& h){
-                            h.single_task([](){});
+                            h.single_task<class SingleTask2>([](){});
                          });
                     }
                     else
@@ -189,8 +194,11 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
     using my_policy_t = Policy;
 
     // they are cpus so this is ok
-    double* v = sycl::malloc_shared<double>(1000000, u[0]);
-    int* j = sycl::malloc_shared<int>(1, u[0]);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, double> dt_helper_v(u[0], 1000000);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, int> dt_helper_j(u[0], 1);
+
+    double* v = dt_helper_v.get_data();
+    int* j = dt_helper_j.get_data();
 
     my_policy_t p{u};
     auto n_samples = u.size();
@@ -238,7 +246,7 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
                 if (*j == 0)
                 {
                      return q.submit([=](sycl::handler& h){
-                        h.single_task([](){});
+                        h.single_task<class SingleTask3>([](){});
                      });
                 }
                 else
@@ -284,7 +292,7 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
                     if (*j == 0)
                     {
                          return q.submit([=](sycl::handler& h){
-                            h.single_task([](){});
+                            h.single_task<class SingleTask4>([](){});
                          });
                     }
                     else
@@ -326,8 +334,11 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
     using my_policy_t = Policy;
 
     // they are cpus so this is ok
-    double* v = sycl::malloc_shared<double>(1000000, u[0]);
-    int* j = sycl::malloc_shared<int>(1, u[0]);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, double> dt_helper_v(u[0], 1000000);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, int> dt_helper_j(u[0], 1);
+
+    double* v = dt_helper_v.get_data();
+    int* j = dt_helper_j.get_data();
 
     my_policy_t p{u};
     auto n_samples = u.size();
@@ -375,7 +386,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                 if (*j == 0)
                 {
                      return q.submit([=](sycl::handler& h){
-                        h.single_task([](){});
+                        h.single_task<class SingleTask5>([](){});
                      });
                 }
                 else
@@ -420,7 +431,7 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                     if (*j == 0)
                     {
                          return q.submit([=](sycl::handler& h){
-                            h.single_task([](){});
+                            h.single_task<class SingleTask6>([](){});
                          });
                     }
                     else

From 1e76278bd749d27012b35188df7bbd81336fd9c7 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko <sergey.kopienko@intel.com>
Date: Fri, 26 Apr 2024 18:36:53 +0200
Subject: [PATCH 19/30] Fix the format of report method - using
 `report_duration` in second param (#1539)

* include/oneapi/dpl/internal/dynamic_selection_traits.h - fix traits to specify report method second parameter type

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

* Change second parameter type of auto_tune_selection_type::report method - to report_duration (std::chrono::milliseconds)

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

---------

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>
---
 .../dynamic_selection_impl/auto_tune_policy.h |  4 ++--
 .../dynamic_selection_impl/sycl_backend.h     | 16 +++++++--------
 .../dpl/internal/dynamic_selection_traits.h   | 20 +++++++++----------
 test/support/inline_backend.h                 | 12 ++++++-----
 4 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 6688ffc6e02..a56260eaa22 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -174,9 +174,9 @@ class auto_tune_policy
         };
 
         void
-        report(const execution_info::task_time_t&, const typename execution_info::task_time_t::value_type& v) const
+        report(const execution_info::task_time_t&, report_duration v) const
         {
-            tuner_->add_new_timing(resource_, v);
+            tuner_->add_new_timing(resource_, v.count());
         }
     };
 
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index a053bdd5a7e..18f811fd764 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -72,12 +72,12 @@ class sycl_backend
 
         void
         report() override{
-            if constexpr (report_value_v<Selection, execution_info::task_time_t>){
+            if constexpr (report_value_v<Selection, execution_info::task_time_t, report_duration>){
                 if(s!=nullptr){
                     const auto time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
                     const auto time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
-                    const auto duration_in_ns = std::chrono::nanoseconds(time_end-time_start);
-                    s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(duration_in_ns).count());
+                    s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(
+                                                             std::chrono::nanoseconds(time_end - time_start)));
                 }
             }
 
@@ -172,10 +172,10 @@ class sycl_backend
             report(s, execution_info::task_submission);
         }
         if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t> ||
-                      report_value_v<SelectionHandle, execution_info::task_time_t>)
+                      report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
         {
             report_clock_type::time_point t0;
-            if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
+            if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
             {
                 if (!is_profiling_enabled)
                 {
@@ -193,7 +193,7 @@ class sycl_backend
                 });
                 waiter =  async_waiter{e2, std::make_shared<SelectionHandle>(s)};
             }
-            if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t>){
+            if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>){
                 if (is_profiling_enabled)
                 {
                     waiter = async_waiter{e1,std::make_shared<SelectionHandle>(s)};
@@ -203,8 +203,8 @@ class sycl_backend
                     auto e2 = q.submit([=](sycl::handler& h){
                         h.depends_on(e1);
                         h.host_task([=](){
-                            const auto tp_now = report_clock_type::now();
-                            s.report(execution_info::task_time, std::chrono::duration_cast<report_duration>(tp_now - t0).count());
+                            s.report(execution_info::task_time,
+                                     std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
                         });
                     });
                     waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index c1955a86c42..47e36c8cba4 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -108,16 +108,16 @@ struct has_report : decltype(has_report_impl<S, Info>(0))
 {
 };
 
-template <typename S, typename Info>
+template <typename S, typename Info, typename ValueType>
 auto
 has_report_value_impl(...) -> std::false_type;
 
-template <typename S, typename Info>
+template <typename S, typename Info, typename ValueType>
 auto
-has_report_value_impl(int) -> decltype(std::declval<S>().report(std::declval<Info>(), 0), std::true_type{});
+has_report_value_impl(int) -> decltype(std::declval<S>().report(std::declval<Info>(), std::declval<ValueType>()), std::true_type{});
 
-template <typename S, typename Info>
-struct has_report_value : decltype(has_report_value_impl<S, Info>(0))
+template <typename S, typename Info, typename ValueType>
+struct has_report_value : decltype(has_report_value_impl<S, Info, ValueType>(0))
 {
 };
 
@@ -277,7 +277,7 @@ template <typename S, typename Info, typename Value>
 void
 report(S&& s, const Info& i, const Value& v)
 {
-    if constexpr (internal::has_report_value<S, Info>::value)
+    if constexpr (internal::has_report_value<S, Info, Value>::value)
     {
         std::forward<S>(s).report(i, v);
     }
@@ -291,13 +291,13 @@ struct report_info
 template <typename S, typename Info>
 inline constexpr bool report_info_v = report_info<S, Info>::value;
 
-template <typename S, typename Info>
+template <typename S, typename Info, typename ValueType>
 struct report_value
 {
-    static constexpr bool value = internal::has_report_value<S, Info>::value;
+    static constexpr bool value = internal::has_report_value<S, Info, ValueType>::value;
 };
-template <typename S, typename Info>
-inline constexpr bool report_value_v = report_value<S, Info>::value;
+template <typename S, typename Info, typename ValueType>
+inline constexpr bool report_value_v = report_value<S, Info, ValueType>::value;
 
 } // namespace experimental
 } // namespace dpl
diff --git a/test/support/inline_backend.h b/test/support/inline_backend.h
index 301dfdf1315..24c0e7cfeaa 100644
--- a/test/support/inline_backend.h
+++ b/test/support/inline_backend.h
@@ -14,6 +14,7 @@
 
 #include <vector>
 #include <atomic>
+#include <chrono>
 
 namespace TestUtils
 {
@@ -50,6 +51,7 @@ class int_inline_backend_t
     using wait_type = int;
     using execution_resource_t = basic_execution_resource_t<resource_type>;
     using resource_container_t = std::vector<execution_resource_t>;
+    using report_duration = std::chrono::milliseconds;
 
   private:
     using native_resource_container_t = std::vector<resource_type>;
@@ -99,8 +101,8 @@ class int_inline_backend_t
     submit(SelectionHandle s, Function&& f, Args&&... args)
     {
         std::chrono::steady_clock::time_point t0;
-        if constexpr (oneapi::dpl::experimental::report_value_v<SelectionHandle,
-                                                                oneapi::dpl::experimental::execution_info::task_time_t>)
+        if constexpr (oneapi::dpl::experimental::report_value_v<
+                          SelectionHandle, oneapi::dpl::experimental::execution_info::task_time_t, report_duration>)
         {
             t0 = std::chrono::steady_clock::now();
         }
@@ -116,11 +118,11 @@ class int_inline_backend_t
         {
             oneapi::dpl::experimental::report(s, oneapi::dpl::experimental::execution_info::task_completion);
         }
-        if constexpr (oneapi::dpl::experimental::report_value_v<SelectionHandle,
-                                                                oneapi::dpl::experimental::execution_info::task_time_t>)
+        if constexpr (oneapi::dpl::experimental::report_value_v<
+                          SelectionHandle, oneapi::dpl::experimental::execution_info::task_time_t, report_duration>)
         {
             report(s, oneapi::dpl::experimental::execution_info::task_time,
-                   (std::chrono::steady_clock::now() - t0).count());
+                   std::chrono::duration_cast<report_duration>(std::chrono::steady_clock::now() - t0));
         }
         return async_waiter{w};
     }

From c217e1a37466afbf48f3b6bf014547ead92ef053 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 26 Apr 2024 12:26:43 -0500
Subject: [PATCH 20/30] Fixing clang format

---
 .../dynamic_selection_impl/auto_tune_policy.h |   3 +-
 .../dynamic_selection_impl/backend_traits.h   |  47 ++++----
 .../dynamic_load_policy.h                     |   3 +-
 .../dynamic_selection_impl/sycl_backend.h     | 106 +++++++++++-------
 .../dpl/internal/dynamic_selection_traits.h   |   3 +-
 5 files changed, 94 insertions(+), 68 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index a56260eaa22..1e6be9b1344 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -222,7 +222,8 @@ class auto_tune_policy
     select(Function&& f, Args&&... args)
     {
         static_assert(sizeof...(KeyArgs) == sizeof...(Args));
-        if constexpr(backend_traits::lazy_report_v<Backend>){
+        if constexpr (backend_traits::lazy_report_v<Backend>)
+        {
             backend_->lazy_report();
         }
         if (state_)
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
index b50a5b3a27c..5d085b24e4f 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
@@ -22,29 +22,30 @@ namespace experimental
 {
 namespace internal
 {
-    template <typename Backend>
-    auto
-    has_lazy_report_impl(...) -> std::false_type;
-
-    template <typename Backend>
-    auto
-    has_lazy_report_impl(int) -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
-
-    template <typename Backend>
-    struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
-    {
-    };
-
-}//namespace internal
-
-namespace backend_traits {
-    template <typename S>
-    struct lazy_report_value
-    {
-        static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report<S>::value;
-    };
-    template <typename S>
-    inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
+template <typename Backend>
+auto
+has_lazy_report_impl(...) -> std::false_type;
+
+template <typename Backend>
+auto
+has_lazy_report_impl(int) -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
+
+template <typename Backend>
+struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
+{
+};
+
+} //namespace internal
+
+namespace backend_traits
+{
+template <typename S>
+struct lazy_report_value
+{
+    static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report<S>::value;
+};
+template <typename S>
+inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
 
 } //namespace backend_traits
 
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
index 9a7e60cd6fb..d8d656a0594 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
@@ -150,7 +150,8 @@ struct dynamic_load_policy
     selection_type
     select(Args&&...)
     {
-        if constexpr(backend_traits::lazy_report_v<Backend>){
+        if constexpr (backend_traits::lazy_report_v<Backend>)
+        {
             backend_->lazy_report();
         }
         if (state_)
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 18f811fd764..a213fc03336 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -41,19 +41,24 @@ class sycl_backend
     using report_duration = std::chrono::milliseconds;
 
     static inline bool is_profiling_enabled = false;
+
   private:
-    class async_waiter_base{
-        public:
-            virtual void report() = 0;
-            virtual bool is_complete() = 0;
-            virtual ~async_waiter_base() = default;
+    class async_waiter_base
+    {
+      public:
+        virtual void
+        report() = 0;
+        virtual bool
+        is_complete() = 0;
+        virtual ~async_waiter_base() = default;
     };
 
-    template<typename Selection>
+    template <typename Selection>
     class async_waiter : public async_waiter_base
     {
         sycl::event e_;
         std::shared_ptr<Selection> s;
+
       public:
         async_waiter() = default;
         async_waiter(sycl::event e, std::shared_ptr<Selection> selection) : e_(e), s(selection) {}
@@ -71,53 +76,64 @@ class sycl_backend
         }
 
         void
-        report() override{
-            if constexpr (report_value_v<Selection, execution_info::task_time_t, report_duration>){
-                if(s!=nullptr){
-                    const auto time_start = e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
+        report() override
+        {
+            if constexpr (report_value_v<Selection, execution_info::task_time_t, report_duration>)
+            {
+                if (s != nullptr)
+                {
+                    const auto time_start =
+                        e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
                     const auto time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
                     s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(
                                                              std::chrono::nanoseconds(time_end - time_start)));
                 }
             }
-
-
         }
 
         bool
-        is_complete() override{
-            return e_.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete;
+        is_complete() override
+        {
+            return e_.get_info<sycl::info::event::command_execution_status>() ==
+                   sycl::info::event_command_status::complete;
         }
-
     };
 
-    struct async_waiter_list_t{
+    struct async_waiter_list_t
+    {
 
         std::mutex m_;
         std::vector<std::unique_ptr<async_waiter_base>> async_waiters;
 
-        void add_waiter(async_waiter_base *t){
+        void
+        add_waiter(async_waiter_base* t)
+        {
             std::lock_guard<std::mutex> l(m_);
             async_waiters.push_back(std::unique_ptr<async_waiter_base>(t));
         }
 
-        void lazy_report(){
-            if(is_profiling_enabled){
+        void
+        lazy_report()
+        {
+            if (is_profiling_enabled)
+            {
                 std::lock_guard<std::mutex> l(m_);
-                async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(), [](std::unique_ptr<async_waiter_base>& async_waiter){
-                        if(async_waiter->is_complete()){
-                            async_waiter->report();
-                            return true;
-                        }
-                        return false;
-                    }), async_waiters.end());
+                async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(),
+                                                   [](std::unique_ptr<async_waiter_base>& async_waiter) {
+                                                       if (async_waiter->is_complete())
+                                                       {
+                                                           async_waiter->report();
+                                                           return true;
+                                                       }
+                                                       return false;
+                                                   }),
+                                    async_waiters.end());
             }
         }
     };
 
     async_waiter_list_t async_waiter_list;
 
-
     class submission_group
     {
         resource_container_t resources_;
@@ -154,7 +170,8 @@ class sycl_backend
         for (auto e : v)
         {
             global_rank_.push_back(e);
-            if(!e.template has_property<sycl::property::queue::enable_profiling>()){
+            if (!e.template has_property<sycl::property::queue::enable_profiling>())
+            {
                 profiling = false;
             }
         }
@@ -184,25 +201,26 @@ class sycl_backend
             }
             async_waiter<SelectionHandle> waiter;
             auto e1 = f(q, std::forward<Args>(args)...);
-            if constexpr(report_info_v<SelectionHandle, execution_info::task_completion_t>){
-                auto e2 = q.submit([=](sycl::handler& h){
+            if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)
+            {
+                auto e2 = q.submit([=](sycl::handler& h) {
                     h.depends_on(e1);
-                    h.host_task([=](){
-                        s.report(execution_info::task_completion);
-                    });
+                    h.host_task([=]() { s.report(execution_info::task_completion); });
                 });
-                waiter =  async_waiter{e2, std::make_shared<SelectionHandle>(s)};
+                waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
             }
-            if constexpr(report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>){
+            if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
+            {
                 if (is_profiling_enabled)
                 {
-                    waiter = async_waiter{e1,std::make_shared<SelectionHandle>(s)};
+                    waiter = async_waiter{e1, std::make_shared<SelectionHandle>(s)};
                     async_waiter_list.add_waiter(new async_waiter(waiter));
                 }
-                else{
-                    auto e2 = q.submit([=](sycl::handler& h){
+                else
+                {
+                    auto e2 = q.submit([=](sycl::handler& h) {
                         h.depends_on(e1);
-                        h.host_task([=](){
+                        h.host_task([=]() {
                             s.report(execution_info::task_time,
                                      std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
                         });
@@ -230,7 +248,9 @@ class sycl_backend
         return global_rank_;
     }
 
-    void lazy_report(){
+    void
+    lazy_report()
+    {
         async_waiter_list.lazy_report();
     }
 
@@ -246,12 +266,14 @@ class sycl_backend
         auto devices = sycl::device::get_devices();
         for (auto& x : devices)
         {
-            if(!x.has(sycl::aspect::queue_profiling)){
+            if (!x.has(sycl::aspect::queue_profiling))
+            {
                 profiling = false;
             }
         }
         is_profiling_enabled = profiling;
-        if(is_profiling_enabled){
+        if (is_profiling_enabled)
+        {
             prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
         }
         for (auto& x : devices)
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index 47e36c8cba4..bd6f79ba898 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -114,7 +114,8 @@ has_report_value_impl(...) -> std::false_type;
 
 template <typename S, typename Info, typename ValueType>
 auto
-has_report_value_impl(int) -> decltype(std::declval<S>().report(std::declval<Info>(), std::declval<ValueType>()), std::true_type{});
+has_report_value_impl(int)
+    -> decltype(std::declval<S>().report(std::declval<Info>(), std::declval<ValueType>()), std::true_type{});
 
 template <typename S, typename Info, typename ValueType>
 struct has_report_value : decltype(has_report_value_impl<S, Info, ValueType>(0))

From 75dd2263e0a188f6df06f14b382c14bda836adac Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 26 Apr 2024 15:11:15 -0500
Subject: [PATCH 21/30] Adding header file for backend_traits

---
 .../dpl/internal/dynamic_selection_impl/dynamic_load_policy.h    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
index d8d656a0594..c34b92e1956 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
@@ -19,6 +19,7 @@
 #include <type_traits>
 #include <utility>
 #include "oneapi/dpl/internal/dynamic_selection_traits.h"
+#include "oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h"
 #if _DS_BACKEND_SYCL != 0
 #    include "oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h"
 #endif

From bc95cd296be548ec17d02840f8fd5b695539b7ae Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 26 Apr 2024 16:21:53 -0500
Subject: [PATCH 22/30] Changed structure for sycl_backend submit

---
 .../dynamic_selection_impl/sycl_backend.h     | 45 +++++++++++--------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index a213fc03336..292b87cb4f9 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -199,34 +199,41 @@ class sycl_backend
                     t0 = report_clock_type::now();
                 }
             }
-            async_waiter<SelectionHandle> waiter;
             auto e1 = f(q, std::forward<Args>(args)...);
-            if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)
-            {
-                auto e2 = q.submit([=](sycl::handler& h) {
-                    h.depends_on(e1);
-                    h.host_task([=]() { s.report(execution_info::task_completion); });
-                });
-                waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
-            }
+            async_waiter<SelectionHandle> waiter{e1, std::make_shared<SelectionHandle>(s)};
+
+
             if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
             {
                 if (is_profiling_enabled)
                 {
-                    waiter = async_waiter{e1, std::make_shared<SelectionHandle>(s)};
                     async_waiter_list.add_waiter(new async_waiter(waiter));
                 }
-                else
-                {
-                    auto e2 = q.submit([=](sycl::handler& h) {
-                        h.depends_on(e1);
-                        h.host_task([=]() {
-                            s.report(execution_info::task_time,
+            }
+
+            bool is_host_task_needed = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration> && !is_profiling_enabled
+                                       || report_info_v<SelectionHandle, execution_info::task_completion_t>;
+
+            if (is_host_task_needed)
+            {
+                auto e2 = q.submit([=](sycl::handler& h) {
+                    h.depends_on(e1);
+                    h.host_task([=]() {
+                        if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
+                        {
+                            if(!is_profiling_enabled)
+                            {
+                                s.report(execution_info::task_time,
                                      std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
-                        });
+                            }
+                        }
+                        if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)
+                        {
+                            s.report(execution_info::task_completion);
+                        }
                     });
-                    waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
-                }
+                });
+                waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
             }
             return waiter;
         }

From 19e78e711b8794169661acfae0770bcc66a9e4e1 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Fri, 26 Apr 2024 16:59:25 -0500
Subject: [PATCH 23/30] Fixed clang format

---
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 292b87cb4f9..49fe0ac6fef 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -202,7 +202,6 @@ class sycl_backend
             auto e1 = f(q, std::forward<Args>(args)...);
             async_waiter<SelectionHandle> waiter{e1, std::make_shared<SelectionHandle>(s)};
 
-
             if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
             {
                 if (is_profiling_enabled)
@@ -211,8 +210,9 @@ class sycl_backend
                 }
             }
 
-            bool is_host_task_needed = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration> && !is_profiling_enabled
-                                       || report_info_v<SelectionHandle, execution_info::task_completion_t>;
+            bool is_host_task_needed = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration> &&
+                                           !is_profiling_enabled ||
+                                       report_info_v<SelectionHandle, execution_info::task_completion_t>;
 
             if (is_host_task_needed)
             {
@@ -221,10 +221,10 @@ class sycl_backend
                     h.host_task([=]() {
                         if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
                         {
-                            if(!is_profiling_enabled)
+                            if (!is_profiling_enabled)
                             {
                                 s.report(execution_info::task_time,
-                                     std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
+                                         std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
                             }
                         }
                         if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)

From 3d3dab2968b7efa950c7a037b501f582155ababb Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Mon, 29 Apr 2024 11:28:24 -0500
Subject: [PATCH 24/30] Changes to make variables is_profiling_enabled and
 aliases for report function private

---
 .../dpl/internal/dynamic_selection_impl/backend_traits.h   | 1 -
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h     | 7 ++++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
index 5d085b24e4f..26ce70171f7 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
@@ -11,7 +11,6 @@
 #define _ONEDPL_INTERNAL_BACKEND_TRAITS_H
 
 #include <utility>
-#include <cstdint>
 #include <type_traits>
 
 namespace oneapi
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 49fe0ac6fef..e83823a94ac 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -37,12 +37,13 @@ class sycl_backend
     using execution_resource_t = resource_type;
     using resource_container_t = std::vector<execution_resource_t>;
 
-    using report_clock_type = std::chrono::steady_clock;
-    using report_duration = std::chrono::milliseconds;
+
+  private:
 
     static inline bool is_profiling_enabled = false;
+    using report_clock_type = std::chrono::steady_clock;
+    using report_duration = std::chrono::milliseconds;
 
-  private:
     class async_waiter_base
     {
       public:

From 80096690b1f6e45b7f9e19f05a55f890ac8b89b1 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Mon, 29 Apr 2024 13:40:09 -0500
Subject: [PATCH 25/30] Moved is_profiling_enabled outside the async_waiter
 class

---
 .../dynamic_selection_impl/sycl_backend.h     | 32 +++++++++----------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index e83823a94ac..5c376251c46 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -37,9 +37,7 @@ class sycl_backend
     using execution_resource_t = resource_type;
     using resource_container_t = std::vector<execution_resource_t>;
 
-
   private:
-
     static inline bool is_profiling_enabled = false;
     using report_clock_type = std::chrono::steady_clock;
     using report_duration = std::chrono::milliseconds;
@@ -116,20 +114,18 @@ class sycl_backend
         void
         lazy_report()
         {
-            if (is_profiling_enabled)
-            {
-                std::lock_guard<std::mutex> l(m_);
-                async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(),
-                                                   [](std::unique_ptr<async_waiter_base>& async_waiter) {
-                                                       if (async_waiter->is_complete())
-                                                       {
-                                                           async_waiter->report();
-                                                           return true;
-                                                       }
-                                                       return false;
-                                                   }),
-                                    async_waiters.end());
-            }
+            std::lock_guard<std::mutex> l(m_);
+            async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(),
+                                               [](std::unique_ptr<async_waiter_base>& async_waiter) {
+                                                   if (async_waiter->is_complete())
+                                                   {
+                                                       async_waiter->report();
+                                                       return true;
+                                                   }
+                                                   return false;
+                                               }),
+                                async_waiters.end());
+
         }
     };
 
@@ -259,7 +255,9 @@ class sycl_backend
     void
     lazy_report()
     {
-        async_waiter_list.lazy_report();
+        if (is_profiling_enabled){
+            async_waiter_list.lazy_report();
+        }
     }
 
   private:

From ae46b12a943803b2083464ca8ba78c894c7ff559 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Mon, 29 Apr 2024 14:51:49 -0500
Subject: [PATCH 26/30] Making sycl backend submit function more readable

---
 .../dynamic_selection_impl/sycl_backend.h     | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 5c376251c46..54e79e5abf9 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -180,16 +180,19 @@ class sycl_backend
     auto
     submit(SelectionHandle s, Function&& f, Args&&... args)
     {
+        constexpr bool report_task_completion = report_info_v<SelectionHandle, execution_info::task_completion_t>;
+        constexpr bool report_task_submission = report_info_v<SelectionHandle, execution_info::task_submission_t>;
+        constexpr bool  report_task_time = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>;
+
         auto q = unwrap(s);
-        if constexpr (report_info_v<SelectionHandle, execution_info::task_submission_t>)
+        if constexpr (report_task_submission)
         {
             report(s, execution_info::task_submission);
         }
-        if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t> ||
-                      report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
+        if constexpr (report_task_completion || report_task_time)
         {
             report_clock_type::time_point t0;
-            if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
+            if constexpr (report_task_time)
             {
                 if (!is_profiling_enabled)
                 {
@@ -199,7 +202,7 @@ class sycl_backend
             auto e1 = f(q, std::forward<Args>(args)...);
             async_waiter<SelectionHandle> waiter{e1, std::make_shared<SelectionHandle>(s)};
 
-            if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
+            if constexpr (report_task_time)
             {
                 if (is_profiling_enabled)
                 {
@@ -207,16 +210,14 @@ class sycl_backend
                 }
             }
 
-            bool is_host_task_needed = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration> &&
-                                           !is_profiling_enabled ||
-                                       report_info_v<SelectionHandle, execution_info::task_completion_t>;
+            bool is_host_task_needed = report_task_time && !is_profiling_enabled || report_task_completion;
 
             if (is_host_task_needed)
             {
                 auto e2 = q.submit([=](sycl::handler& h) {
                     h.depends_on(e1);
                     h.host_task([=]() {
-                        if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>)
+                        if constexpr (report_task_time)
                         {
                             if (!is_profiling_enabled)
                             {
@@ -224,7 +225,7 @@ class sycl_backend
                                          std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
                             }
                         }
-                        if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)
+                        if constexpr (report_task_completion)
                         {
                             s.report(execution_info::task_completion);
                         }

From 0cf0daab98663ccf69998806363193bfc01a15d0 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Mon, 29 Apr 2024 16:00:25 -0500
Subject: [PATCH 27/30] Fixed clang format

---
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 54e79e5abf9..59637b8d24c 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -125,7 +125,6 @@ class sycl_backend
                                                    return false;
                                                }),
                                 async_waiters.end());
-
         }
     };
 
@@ -182,7 +181,7 @@ class sycl_backend
     {
         constexpr bool report_task_completion = report_info_v<SelectionHandle, execution_info::task_completion_t>;
         constexpr bool report_task_submission = report_info_v<SelectionHandle, execution_info::task_submission_t>;
-        constexpr bool  report_task_time = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>;
+        constexpr bool report_task_time = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>;
 
         auto q = unwrap(s);
         if constexpr (report_task_submission)
@@ -256,7 +255,8 @@ class sycl_backend
     void
     lazy_report()
     {
-        if (is_profiling_enabled){
+        if (is_profiling_enabled)
+        {
             async_waiter_list.lazy_report();
         }
     }

From 5028e631e4640528392cb8ce8f92cf7727dbf026 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko <sergey.kopienko@intel.com>
Date: Tue, 30 Apr 2024 17:30:21 +0200
Subject: [PATCH 28/30] Simplify sycl_backend::submit(SelectionHandle s,
 Function&& f, Args&&... args) function (#1547)

* include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h - simplify sycl_backend::submit(SelectionHandle s, Function&& f, Args&&... args) function

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

* include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h - remove not required type name

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

---------

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>
---
 .../dynamic_selection_impl/sycl_backend.h     | 30 +++++--------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 59637b8d24c..a99d4bcb3a0 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -184,34 +184,24 @@ class sycl_backend
         constexpr bool report_task_time = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>;
 
         auto q = unwrap(s);
+
         if constexpr (report_task_submission)
-        {
             report(s, execution_info::task_submission);
-        }
+
         if constexpr (report_task_completion || report_task_time)
         {
-            report_clock_type::time_point t0;
-            if constexpr (report_task_time)
-            {
-                if (!is_profiling_enabled)
-                {
-                    t0 = report_clock_type::now();
-                }
-            }
+            const auto t0 = report_clock_type::now();
+
             auto e1 = f(q, std::forward<Args>(args)...);
             async_waiter<SelectionHandle> waiter{e1, std::make_shared<SelectionHandle>(s)};
 
             if constexpr (report_task_time)
             {
                 if (is_profiling_enabled)
-                {
                     async_waiter_list.add_waiter(new async_waiter(waiter));
-                }
             }
 
-            bool is_host_task_needed = report_task_time && !is_profiling_enabled || report_task_completion;
-
-            if (is_host_task_needed)
+            if (report_task_time && !is_profiling_enabled || report_task_completion)
             {
                 auto e2 = q.submit([=](sycl::handler& h) {
                     h.depends_on(e1);
@@ -219,25 +209,19 @@ class sycl_backend
                         if constexpr (report_task_time)
                         {
                             if (!is_profiling_enabled)
-                            {
                                 s.report(execution_info::task_time,
                                          std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
-                            }
                         }
                         if constexpr (report_task_completion)
-                        {
                             s.report(execution_info::task_completion);
-                        }
                     });
                 });
                 waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
             }
             return waiter;
         }
-        else
-        {
-            return async_waiter{f(q, std::forward<Args>(args)...), std::make_shared<SelectionHandle>(s)};
-        }
+
+        return async_waiter{f(q, std::forward<Args>(args)...), std::make_shared<SelectionHandle>(s)};
     }
 
     auto

From aa07b29f8184dbc2723c76b4fb5b6b7adb764d8f Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Tue, 30 Apr 2024 10:41:37 -0500
Subject: [PATCH 29/30] Minor changes to auto_tune_policy.h and sycl_backend.h

---
 .../internal/dynamic_selection_impl/auto_tune_policy.h    | 2 +-
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index 1e6be9b1344..9fc901977ac 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -58,7 +58,7 @@ class auto_tune_policy
     struct resource_with_index_t
     {
         wrapped_resource_t r_;
-        size_type index_;
+        size_type index_ = 0;
     };
 
     struct time_data_t
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index a99d4bcb3a0..ae5d5a1e008 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -46,9 +46,9 @@ class sycl_backend
     {
       public:
         virtual void
-        report() = 0;
+        report() const = 0;
         virtual bool
-        is_complete() = 0;
+        is_complete() const = 0;
         virtual ~async_waiter_base() = default;
     };
 
@@ -75,7 +75,7 @@ class sycl_backend
         }
 
         void
-        report() override
+        report() const override
         {
             if constexpr (report_value_v<Selection, execution_info::task_time_t, report_duration>)
             {
@@ -91,7 +91,7 @@ class sycl_backend
         }
 
         bool
-        is_complete() override
+        is_complete() const override
         {
             return e_.get_info<sycl::info::event::command_execution_status>() ==
                    sycl::info::event_command_status::complete;

From 797ea84356c4b462e503b55064be93a24c23befc Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Tue, 30 Apr 2024 10:48:41 -0500
Subject: [PATCH 30/30] Improving readability

---
 .../dpl/internal/dynamic_selection_impl/sycl_backend.h      | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index ae5d5a1e008..52df9966946 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -45,10 +45,8 @@ class sycl_backend
     class async_waiter_base
     {
       public:
-        virtual void
-        report() const = 0;
-        virtual bool
-        is_complete() const = 0;
+        virtual void report() const = 0;
+        virtual bool is_complete() const = 0;
         virtual ~async_waiter_base() = default;
     };