From 5ea755c7d984811693a3d61b10885513895e3086 Mon Sep 17 00:00:00 2001
From: Anuya Welling <anuya.welling@intel.com>
Date: Tue, 30 Apr 2024 14:08:53 -0500
Subject: [PATCH] [Dynamic Selection] Adding sycl profiling for auto tune
 policy (#1464)

* Added static assert for keyArgs==Args

* WIP for jit compilation

* Fixing Autotune bug

* Added sycl profiling without host task

* Corrections in dynamic traits

* Remove comments in dynamic_selection_traits

* Adding back jit compolation restrictions

* Adressing comments to add thread safety, better traits

* Added backend trait to check if profiling is enabled

* Backend traits, renaming and adding a lock to lazy report

* Fixed memory leaks

* No nullptr for selection handle in async waiter constructor

* Added an erase and remove_if

* Adressing comments for profiling report, Adding profiling to default sycl backend constructor

* Changes to std::chrono::duration

* Addressed comments for sycl backend

* Addressed comments to sycl backend

* Fix USM memory leaks and create unique task names (#1537)

* test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp - fix USM shared memory leaks

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

* test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp - create unique task names for h.single_task([](){});

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

---------

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

* Fix the format of report method - using `report_duration` in second param (#1539)

* include/oneapi/dpl/internal/dynamic_selection_traits.h - fix traits to specify report method second parameter type

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

* Change second parameter type of auto_tune_selection_type::report method - to report_duration (std::chrono::milliseconds)

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

---------

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>

* Fixing clang format

---------

Signed-off-by: Sergey Kopienko <sergey.kopienko@intel.com>
Co-authored-by: Sergey Kopienko <sergey.kopienko@intel.com>
---
 .../dynamic_selection_impl/auto_tune_policy.h |  23 ++-
 .../dynamic_selection_impl/backend_traits.h   |  55 ++++++
 .../dynamic_load_policy.h                     |   5 +
 .../dynamic_selection_impl/sycl_backend.h     | 171 +++++++++++++++---
 .../dpl/internal/dynamic_selection_traits.h   |  21 ++-
 .../sycl/test_auto_tune_policy_sycl.pass.cpp  | 163 +++++++++++------
 test/support/inline_backend.h                 |  12 +-
 7 files changed, 347 insertions(+), 103 deletions(-)
 create mode 100644 include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
index bfeaf5cd08a..9fc901977ac 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/auto_tune_policy.h
@@ -16,12 +16,14 @@
 #include <mutex>
 #include <utility>
 #include <chrono>
+#include <ratio>
 #include <limits>
 #include <vector>
 #include <type_traits>
 #include <tuple>
 #include <unordered_map>
 #include "oneapi/dpl/internal/dynamic_selection_traits.h"
+#include "oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h"
 #if _DS_BACKEND_SYCL != 0
 #    include "oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h"
 #endif
@@ -47,13 +49,16 @@ class auto_tune_policy
     using size_type = typename std::vector<typename Backend::resource_type>::size_type;
     using timing_t = uint64_t;
 
+    using report_clock_type = std::chrono::steady_clock;
+    using report_duration = std::chrono::milliseconds;
+
     static constexpr timing_t never_resample = 0;
     static constexpr size_type use_best_resource = ~size_type(0);
 
     struct resource_with_index_t
     {
         wrapped_resource_t r_;
-        size_type index_;
+        size_type index_ = 0;
     };
 
     struct time_data_t
@@ -66,7 +71,7 @@ class auto_tune_policy
     {
         std::mutex m_;
 
-        std::chrono::steady_clock::time_point t0_;
+        report_clock_type::time_point t0_;
 
         timing_t best_timing_ = std::numeric_limits<timing_t>::max();
         resource_with_index_t best_resource_;
@@ -80,7 +85,7 @@ class auto_tune_policy
         timing_t resample_time_ = 0.0;
 
         tuner_t(resource_with_index_t br, size_type resources_size, timing_t rt)
-            : t0_(std::chrono::steady_clock::now()), best_resource_(br), max_resource_to_profile_(resources_size),
+            : t0_(report_clock_type::now()), best_resource_(br), max_resource_to_profile_(resources_size),
               resample_time_(rt)
         {
         }
@@ -100,8 +105,8 @@ class auto_tune_policy
             }
             else
             {
-                auto now = std::chrono::steady_clock::now();
-                auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(now - t0_).count();
+                const auto now = report_clock_type::now();
+                const auto ms = std::chrono::duration_cast<report_duration>(now - t0_).count();
                 if (ms < resample_time_)
                 {
                     return use_best_resource;
@@ -169,9 +174,9 @@ class auto_tune_policy
         };
 
         void
-        report(const execution_info::task_time_t&, const typename execution_info::task_time_t::value_type& v) const
+        report(const execution_info::task_time_t&, report_duration v) const
         {
-            tuner_->add_new_timing(resource_, v);
+            tuner_->add_new_timing(resource_, v.count());
         }
     };
 
@@ -217,6 +222,10 @@ class auto_tune_policy
     select(Function&& f, Args&&... args)
     {
         static_assert(sizeof...(KeyArgs) == sizeof...(Args));
+        if constexpr (backend_traits::lazy_report_v<Backend>)
+        {
+            backend_->lazy_report();
+        }
         if (state_)
         {
             std::lock_guard<std::mutex> l(state_->m_);
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
new file mode 100644
index 00000000000..26ce70171f7
--- /dev/null
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h
@@ -0,0 +1,55 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) 2023 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ONEDPL_INTERNAL_BACKEND_TRAITS_H
+#define _ONEDPL_INTERNAL_BACKEND_TRAITS_H
+
+#include <utility>
+#include <type_traits>
+
+namespace oneapi
+{
+namespace dpl
+{
+namespace experimental
+{
+namespace internal
+{
+template <typename Backend>
+auto
+has_lazy_report_impl(...) -> std::false_type;
+
+template <typename Backend>
+auto
+has_lazy_report_impl(int) -> decltype(std::declval<Backend>().lazy_report(), std::true_type{});
+
+template <typename Backend>
+struct has_lazy_report : decltype(has_lazy_report_impl<Backend>(0))
+{
+};
+
+} //namespace internal
+
+namespace backend_traits
+{
+template <typename S>
+struct lazy_report_value
+{
+    static constexpr bool value = ::oneapi::dpl::experimental::internal::has_lazy_report<S>::value;
+};
+template <typename S>
+inline constexpr bool lazy_report_v = lazy_report_value<S>::value;
+
+} //namespace backend_traits
+
+} // namespace experimental
+} // namespace dpl
+} // namespace oneapi
+
+#endif /*_ONEDPL_INTERNAL_BACKEND_TRAITS_H*/
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
index d865b9b7fba..c34b92e1956 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/dynamic_load_policy.h
@@ -19,6 +19,7 @@
 #include <type_traits>
 #include <utility>
 #include "oneapi/dpl/internal/dynamic_selection_traits.h"
+#include "oneapi/dpl/internal/dynamic_selection_impl/backend_traits.h"
 #if _DS_BACKEND_SYCL != 0
 #    include "oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h"
 #endif
@@ -150,6 +151,10 @@ struct dynamic_load_policy
     selection_type
     select(Args&&...)
     {
+        if constexpr (backend_traits::lazy_report_v<Backend>)
+        {
+            backend_->lazy_report();
+        }
         if (state_)
         {
             std::lock_guard<std::mutex> l(state_->m_);
diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
index 25921bdf819..52df9966946 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
@@ -16,9 +16,11 @@
 #include "oneapi/dpl/internal/dynamic_selection_impl/scoring_policy_defs.h"
 
 #include <chrono>
+#include <ratio>
 #include <vector>
 #include <memory>
 #include <utility>
+#include <algorithm>
 
 namespace oneapi
 {
@@ -36,24 +38,96 @@ class sycl_backend
     using resource_container_t = std::vector<execution_resource_t>;
 
   private:
-    class async_waiter
+    static inline bool is_profiling_enabled = false;
+    using report_clock_type = std::chrono::steady_clock;
+    using report_duration = std::chrono::milliseconds;
+
+    class async_waiter_base
+    {
+      public:
+        virtual void report() const = 0;
+        virtual bool is_complete() const = 0;
+        virtual ~async_waiter_base() = default;
+    };
+
+    template <typename Selection>
+    class async_waiter : public async_waiter_base
     {
         sycl::event e_;
+        std::shared_ptr<Selection> s;
 
       public:
-        async_waiter(const sycl::event& e) : e_(e) {}
+        async_waiter() = default;
+        async_waiter(sycl::event e, std::shared_ptr<Selection> selection) : e_(e), s(selection) {}
+
         sycl::event
         unwrap()
         {
             return e_;
         }
+
         void
         wait()
         {
             e_.wait();
         }
+
+        void
+        report() const override
+        {
+            if constexpr (report_value_v<Selection, execution_info::task_time_t, report_duration>)
+            {
+                if (s != nullptr)
+                {
+                    const auto time_start =
+                        e_.template get_profiling_info<sycl::info::event_profiling::command_start>();
+                    const auto time_end = e_.template get_profiling_info<sycl::info::event_profiling::command_end>();
+                    s->report(execution_info::task_time, std::chrono::duration_cast<report_duration>(
+                                                             std::chrono::nanoseconds(time_end - time_start)));
+                }
+            }
+        }
+
+        bool
+        is_complete() const override
+        {
+            return e_.get_info<sycl::info::event::command_execution_status>() ==
+                   sycl::info::event_command_status::complete;
+        }
+    };
+
+    struct async_waiter_list_t
+    {
+
+        std::mutex m_;
+        std::vector<std::unique_ptr<async_waiter_base>> async_waiters;
+
+        void
+        add_waiter(async_waiter_base* t)
+        {
+            std::lock_guard<std::mutex> l(m_);
+            async_waiters.push_back(std::unique_ptr<async_waiter_base>(t));
+        }
+
+        void
+        lazy_report()
+        {
+            std::lock_guard<std::mutex> l(m_);
+            async_waiters.erase(std::remove_if(async_waiters.begin(), async_waiters.end(),
+                                               [](std::unique_ptr<async_waiter_base>& async_waiter) {
+                                                   if (async_waiter->is_complete())
+                                                   {
+                                                       async_waiter->report();
+                                                       return true;
+                                                   }
+                                                   return false;
+                                               }),
+                                async_waiters.end());
+        }
     };
 
+    async_waiter_list_t async_waiter_list;
+
     class submission_group
     {
         resource_container_t resources_;
@@ -85,11 +159,17 @@ class sycl_backend
     template <typename NativeUniverseVector>
     sycl_backend(const NativeUniverseVector& v)
     {
+        bool profiling = true;
         global_rank_.reserve(v.size());
         for (auto e : v)
         {
             global_rank_.push_back(e);
+            if (!e.template has_property<sycl::property::queue::enable_profiling>())
+            {
+                profiling = false;
+            }
         }
+        is_profiling_enabled = profiling;
         sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
     }
 
@@ -97,37 +177,49 @@ class sycl_backend
     auto
     submit(SelectionHandle s, Function&& f, Args&&... args)
     {
+        constexpr bool report_task_completion = report_info_v<SelectionHandle, execution_info::task_completion_t>;
+        constexpr bool report_task_submission = report_info_v<SelectionHandle, execution_info::task_submission_t>;
+        constexpr bool report_task_time = report_value_v<SelectionHandle, execution_info::task_time_t, report_duration>;
+
         auto q = unwrap(s);
-        if constexpr (report_info_v<SelectionHandle, execution_info::task_submission_t>)
-        {
+
+        if constexpr (report_task_submission)
             report(s, execution_info::task_submission);
-        }
-        if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t> ||
-                      report_value_v<SelectionHandle, execution_info::task_time_t>)
+
+        if constexpr (report_task_completion || report_task_time)
         {
-            std::chrono::steady_clock::time_point t0;
-            if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
+            const auto t0 = report_clock_type::now();
+
+            auto e1 = f(q, std::forward<Args>(args)...);
+            async_waiter<SelectionHandle> waiter{e1, std::make_shared<SelectionHandle>(s)};
+
+            if constexpr (report_task_time)
             {
-                t0 = std::chrono::steady_clock::now();
+                if (is_profiling_enabled)
+                    async_waiter_list.add_waiter(new async_waiter(waiter));
             }
-            auto e1 = f(q, std::forward<Args>(args)...);
-            auto e2 = q.submit([=](sycl::handler& h) {
-                h.depends_on(e1);
-                h.host_task([=]() {
-                    if constexpr (report_value_v<SelectionHandle, execution_info::task_time_t>)
-                        s.report(execution_info::task_time, (std::chrono::steady_clock::now() - t0).count());
-                    if constexpr (report_info_v<SelectionHandle, execution_info::task_completion_t>)
-                    {
-                        s.report(execution_info::task_completion);
-                    }
+
+            if (report_task_time && !is_profiling_enabled || report_task_completion)
+            {
+                auto e2 = q.submit([=](sycl::handler& h) {
+                    h.depends_on(e1);
+                    h.host_task([=]() {
+                        if constexpr (report_task_time)
+                        {
+                            if (!is_profiling_enabled)
+                                s.report(execution_info::task_time,
+                                         std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
+                        }
+                        if constexpr (report_task_completion)
+                            s.report(execution_info::task_completion);
+                    });
                 });
-            });
-            return async_waiter{e2};
-        }
-        else
-        {
-            return async_waiter{f(unwrap(s), std::forward<Args>(args)...)};
+                waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
+            }
+            return waiter;
         }
+
+        return async_waiter{f(q, std::forward<Args>(args)...), std::make_shared<SelectionHandle>(s)};
     }
 
     auto
@@ -142,6 +234,15 @@ class sycl_backend
         return global_rank_;
     }
 
+    void
+    lazy_report()
+    {
+        if (is_profiling_enabled)
+        {
+            async_waiter_list.lazy_report();
+        }
+    }
+
   private:
     resource_container_t global_rank_;
     std::unique_ptr<submission_group> sgroup_ptr_;
@@ -149,10 +250,24 @@ class sycl_backend
     void
     initialize_default_resources()
     {
+        bool profiling = true;
+        auto prop_list = sycl::property_list{};
         auto devices = sycl::device::get_devices();
-        for (auto x : devices)
+        for (auto& x : devices)
+        {
+            if (!x.has(sycl::aspect::queue_profiling))
+            {
+                profiling = false;
+            }
+        }
+        is_profiling_enabled = profiling;
+        if (is_profiling_enabled)
+        {
+            prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
+        }
+        for (auto& x : devices)
         {
-            global_rank_.push_back(sycl::queue{x});
+            global_rank_.push_back(sycl::queue{x, prop_list});
         }
     }
 };
diff --git a/include/oneapi/dpl/internal/dynamic_selection_traits.h b/include/oneapi/dpl/internal/dynamic_selection_traits.h
index c1955a86c42..bd6f79ba898 100644
--- a/include/oneapi/dpl/internal/dynamic_selection_traits.h
+++ b/include/oneapi/dpl/internal/dynamic_selection_traits.h
@@ -108,16 +108,17 @@ struct has_report : decltype(has_report_impl<S, Info>(0))
 {
 };
 
-template <typename S, typename Info>
+template <typename S, typename Info, typename ValueType>
 auto
 has_report_value_impl(...) -> std::false_type;
 
-template <typename S, typename Info>
+template <typename S, typename Info, typename ValueType>
 auto
-has_report_value_impl(int) -> decltype(std::declval<S>().report(std::declval<Info>(), 0), std::true_type{});
+has_report_value_impl(int)
+    -> decltype(std::declval<S>().report(std::declval<Info>(), std::declval<ValueType>()), std::true_type{});
 
-template <typename S, typename Info>
-struct has_report_value : decltype(has_report_value_impl<S, Info>(0))
+template <typename S, typename Info, typename ValueType>
+struct has_report_value : decltype(has_report_value_impl<S, Info, ValueType>(0))
 {
 };
 
@@ -277,7 +278,7 @@ template <typename S, typename Info, typename Value>
 void
 report(S&& s, const Info& i, const Value& v)
 {
-    if constexpr (internal::has_report_value<S, Info>::value)
+    if constexpr (internal::has_report_value<S, Info, Value>::value)
     {
         std::forward<S>(s).report(i, v);
     }
@@ -291,13 +292,13 @@ struct report_info
 template <typename S, typename Info>
 inline constexpr bool report_info_v = report_info<S, Info>::value;
 
-template <typename S, typename Info>
+template <typename S, typename Info, typename ValueType>
 struct report_value
 {
-    static constexpr bool value = internal::has_report_value<S, Info>::value;
+    static constexpr bool value = internal::has_report_value<S, Info, ValueType>::value;
 };
-template <typename S, typename Info>
-inline constexpr bool report_value_v = report_value<S, Info>::value;
+template <typename S, typename Info, typename ValueType>
+inline constexpr bool report_value_v = report_value<S, Info, ValueType>::value;
 
 } // namespace experimental
 } // namespace dpl
diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
index e24c4c00522..d6e21e7db80 100644
--- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
+++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp
@@ -14,6 +14,8 @@
 #include <thread>
 #include "support/test_dynamic_selection_utils.h"
 #include "support/utils.h"
+#include "support/sycl_alloc_utils.h"
+
 #if TEST_DYNAMIC_SELECTION_AVAILABLE
 
 int
@@ -53,8 +55,11 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
     using my_policy_t = Policy;
 
     // they are cpus so this is ok
-    double* v = sycl::malloc_shared<double>(1000000, u[0]);
-    int* j = sycl::malloc_shared<int>(1, u[0]);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, double> dt_helper_v(u[0], 1000000);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, int> dt_helper_j(u[0], 1);
+
+    double* v = dt_helper_v.get_data();
+    int* j = dt_helper_j.get_data();
 
     my_policy_t p{u};
     auto n_samples = u.size();
@@ -101,7 +106,9 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                 ecount += i;
                 if (*j == 0)
                 {
-                    return sycl::event{};
+                     return q.submit([=](sycl::handler& h){
+                        h.single_task<class SingleTask1>([](){});
+                     });
                 }
                 else
                 {
@@ -145,7 +152,9 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
                     ecount += i;
                     if (*j == 0)
                     {
-                        return sycl::event{};
+                         return q.submit([=](sycl::handler& h){
+                            h.single_task<class SingleTask2>([](){});
+                         });
                     }
                     else
                     {
@@ -185,8 +194,11 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
     using my_policy_t = Policy;
 
     // they are cpus so this is ok
-    double* v = sycl::malloc_shared<double>(1000000, u[0]);
-    int* j = sycl::malloc_shared<int>(1, u[0]);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, double> dt_helper_v(u[0], 1000000);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, int> dt_helper_j(u[0], 1);
+
+    double* v = dt_helper_v.get_data();
+    int* j = dt_helper_j.get_data();
 
     my_policy_t p{u};
     auto n_samples = u.size();
@@ -233,7 +245,9 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
                 ecount += i;
                 if (*j == 0)
                 {
-                    return sycl::event{};
+                     return q.submit([=](sycl::handler& h){
+                        h.single_task<class SingleTask3>([](){});
+                     });
                 }
                 else
                 {
@@ -277,7 +291,9 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
                     ecount += i;
                     if (*j == 0)
                     {
-                        return sycl::event{};
+                         return q.submit([=](sycl::handler& h){
+                            h.single_task<class SingleTask4>([](){});
+                         });
                     }
                     else
                     {
@@ -310,6 +326,7 @@ test_auto_submit_wait_on_group(UniverseContainer u, int best_resource)
     return 0;
 }
 
+
 template <bool call_select_before_submit, typename Policy, typename UniverseContainer>
 int
 test_auto_submit_and_wait(UniverseContainer u, int best_resource)
@@ -317,8 +334,11 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
     using my_policy_t = Policy;
 
     // they are cpus so this is ok
-    double* v = sycl::malloc_shared<double>(1000000, u[0]);
-    int* j = sycl::malloc_shared<int>(1, u[0]);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, double> dt_helper_v(u[0], 1000000);
+    TestUtils::usm_data_transfer<sycl::usm::alloc::shared, int> dt_helper_j(u[0], 1);
+
+    double* v = dt_helper_v.get_data();
+    int* j = dt_helper_j.get_data();
 
     my_policy_t p{u};
     auto n_samples = u.size();
@@ -365,7 +385,9 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                 ecount += i;
                 if (*j == 0)
                 {
-                    return sycl::event{};
+                     return q.submit([=](sycl::handler& h){
+                        h.single_task<class SingleTask5>([](){});
+                     });
                 }
                 else
                 {
@@ -408,7 +430,9 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
                     ecount += i;
                     if (*j == 0)
                     {
-                        return sycl::event{};
+                         return q.submit([=](sycl::handler& h){
+                            h.single_task<class SingleTask6>([](){});
+                         });
                     }
                     else
                     {
@@ -440,13 +464,20 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
     return 0;
 }
 
+
+template<bool use_event_profiling=false>
 static inline void
 build_auto_tune_universe(std::vector<sycl::queue>& u)
 {
+    auto prop_list = sycl::property_list{};
+    if(use_event_profiling){
+        prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
+    }
+
     try
     {
         auto device_cpu1 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu1_queue(device_cpu1);
+        sycl::queue cpu1_queue{device_cpu1, prop_list};
         u.push_back(cpu1_queue);
     }
     catch (const sycl::exception&)
@@ -456,7 +487,7 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
     try
     {
         auto device_cpu2 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu2_queue(device_cpu2);
+        sycl::queue cpu2_queue{device_cpu2, prop_list};
         u.push_back(cpu2_queue);
     }
     catch (const sycl::exception&)
@@ -466,7 +497,7 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
     try
     {
         auto device_cpu3 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu3_queue(device_cpu3);
+        sycl::queue cpu3_queue{device_cpu3, prop_list};
         u.push_back(cpu3_queue);
     }
     catch (const sycl::exception&)
@@ -476,7 +507,7 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
     try
     {
         auto device_cpu4 = sycl::device(sycl::cpu_selector_v);
-        sycl::queue cpu4_queue(device_cpu4);
+        sycl::queue cpu4_queue{device_cpu4, prop_list};
         u.push_back(cpu4_queue);
     }
     catch (const sycl::exception&)
@@ -484,7 +515,8 @@ build_auto_tune_universe(std::vector<sycl::queue>& u)
         std::cout << "SKIPPED: Unable to run with cpu_selector\n";
     }
 }
-#endif
+
+#endif //TEST_DYNAMIC_SELECTION_AVAILABLE
 
 int
 main()
@@ -494,52 +526,77 @@ main()
 #if TEST_DYNAMIC_SELECTION_AVAILABLE
 #if !ONEDPL_FPGA_DEVICE || !ONEDPL_FPGA_EMULATOR
     using policy_t = oneapi::dpl::experimental::auto_tune_policy<oneapi::dpl::experimental::sycl_backend>;
-    std::vector<sycl::queue> u;
-    build_auto_tune_universe(u);
+    std::vector<sycl::queue> u1;
+    std::vector<sycl::queue> u2;
+    constexpr bool use_event_profiling = true;
+    build_auto_tune_universe(u1);
+    build_auto_tune_universe<use_event_profiling>(u2);
 
-    //If building the universe is not a success, return
-    if (u.size() != 0)
+    if (u1.size() != 0 || u2.size() !=0 )
     {
-        auto f = [u](int i)
-        {
+        auto f = [u1](int i) {
             if (i <= 8)
-                return u[(i - 1) % 4];
+                return u1[(i - 1) % 4];
             else
-                return u[0];
+                return u1[0];
         };
 
         constexpr bool just_call_submit = false;
         constexpr bool call_select_before_submit = true;
 
-        auto actual = test_auto_initialization(u);
-        actual = test_select<policy_t, decltype(u), const decltype(f)&, true>(u, f);
-        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 0);
-        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 1);
-        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 2);
-        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u, 3);
-        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 0);
-        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 1);
-        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 2);
-        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u, 3);
-        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u, 0);
-        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u, 1);
-        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u, 2);
-        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u, 3);
-
+        auto actual = test_auto_initialization(u1);
+        actual = test_select<policy_t, decltype(u1), const decltype(f)&, true>(u1, f);
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 0);
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 1);
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 2);
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u1, 3);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 0);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 1);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 2);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u1, 3);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 0);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 1);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 2);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u1, 3);
+        // now select then submits
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 0);
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 1);
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 2);
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u1, 3);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 0);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 1);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 2);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u1, 3);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 0);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 1);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 2);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 3);
+        // Use event profiling
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 0);
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 1);
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 2);
+        actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 3);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 0);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 1);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 2);
+        actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 3);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 0);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 1);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 2);
+        actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 3);
         // now select then submits
-        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 0);
-        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 1);
-        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 2);
-        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u, 3);
-        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 0);
-        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 1);
-        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 2);
-        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u, 3);
-
-        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 0);
-        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 1);
-        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 2);
-        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u, 3);
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 0);
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 1);
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 2);
+        actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 3);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 0);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 1);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 2);
+        actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 3);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 0);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 1);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 2);
+        actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 3);
 
         bProcessed = true;
     }
diff --git a/test/support/inline_backend.h b/test/support/inline_backend.h
index 301dfdf1315..24c0e7cfeaa 100644
--- a/test/support/inline_backend.h
+++ b/test/support/inline_backend.h
@@ -14,6 +14,7 @@
 
 #include <vector>
 #include <atomic>
+#include <chrono>
 
 namespace TestUtils
 {
@@ -50,6 +51,7 @@ class int_inline_backend_t
     using wait_type = int;
     using execution_resource_t = basic_execution_resource_t<resource_type>;
     using resource_container_t = std::vector<execution_resource_t>;
+    using report_duration = std::chrono::milliseconds;
 
   private:
     using native_resource_container_t = std::vector<resource_type>;
@@ -99,8 +101,8 @@ class int_inline_backend_t
     submit(SelectionHandle s, Function&& f, Args&&... args)
     {
         std::chrono::steady_clock::time_point t0;
-        if constexpr (oneapi::dpl::experimental::report_value_v<SelectionHandle,
-                                                                oneapi::dpl::experimental::execution_info::task_time_t>)
+        if constexpr (oneapi::dpl::experimental::report_value_v<
+                          SelectionHandle, oneapi::dpl::experimental::execution_info::task_time_t, report_duration>)
         {
             t0 = std::chrono::steady_clock::now();
         }
@@ -116,11 +118,11 @@ class int_inline_backend_t
         {
             oneapi::dpl::experimental::report(s, oneapi::dpl::experimental::execution_info::task_completion);
         }
-        if constexpr (oneapi::dpl::experimental::report_value_v<SelectionHandle,
-                                                                oneapi::dpl::experimental::execution_info::task_time_t>)
+        if constexpr (oneapi::dpl::experimental::report_value_v<
+                          SelectionHandle, oneapi::dpl::experimental::execution_info::task_time_t, report_duration>)
         {
             report(s, oneapi::dpl::experimental::execution_info::task_time,
-                   (std::chrono::steady_clock::now() - t0).count());
+                   std::chrono::duration_cast<report_duration>(std::chrono::steady_clock::now() - t0));
         }
         return async_waiter{w};
     }