From b3c4d41ce862a03ea0ee6a102437d12a2d22cedf Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Tue, 12 Nov 2024 19:29:04 +0100
Subject: [PATCH] add spectral decomposition fallback on GPU

---
 .../dal/algo/linear_regression/test/batch.cpp |   3 +-
 .../algo/linear_regression/test/fixture.hpp   |  78 +++----
 .../backend/primitives/lapack/solve_dpc.cpp   | 191 +++++++++++++++++-
 3 files changed, 229 insertions(+), 43 deletions(-)

diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp
index a5585202c31..33f690d2110 100644
--- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp
+++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp
@@ -51,9 +51,10 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) {
 }
 
 TEMPLATE_LIST_TEST_M(lr_batch_test, "LR with non-PSD matrix", "[lr][batch-nonpsd]", lr_types) {
-    SKIP_IF(this->non_psd_system_not_supported_on_device());
+    SKIP_IF(this->not_float64_friendly());
 
     this->generate(777);
+
     this->run_and_check_linear_indefinite();
     this->run_and_check_linear_indefinite_multioutput();
 }
diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp
index e1e54092552..bfba2abf9e5 100644
--- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp
+++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp
@@ -65,10 +65,6 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         return static_cast<Derived*>(this);
     }
 
-    bool non_psd_system_not_supported_on_device() {
-        return this->get_policy().is_gpu();
-    }
-
     table compute_responses(const table& beta, const table& bias, const table& data) const {
         const auto s_count = data.get_row_count();
 
@@ -299,18 +295,20 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
     here is not positive-definite, thus it has an infinite number of possible solutions. The
     solution here is the one with minimum norm, which is typically more desirable. */
     void run_and_check_linear_indefinite(double tol = 1e-3) {
-        const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
-                             0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
-                             0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
-        const double y[] = { 0.13632112, 1.53203308, -0.65996941 };
+        const auto dtype =
+            std::is_same<float_t, double>::value ? data_type::float64 : data_type::float32;
+        const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
+                              0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
+                              0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
+        const float_t y[] = { 0.13632112, 1.53203308, -0.65996941 };
         auto X_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 5)
                          .copy_data(X, 3, 5)
                          .build();
         auto y_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 1)
                          .copy_data(y, 3, 1)
@@ -321,20 +319,20 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         const auto coefs = train_res.get_coefficients();
 
         if (desc.get_result_options().test(result_options::intercept)) {
-            const double expected_beta[] = { 0.27785494,
-                                             0.53011669,
-                                             0.34352259,
-                                             0.40506216,
-                                             -1.26026447 };
-            const double expected_intercept[] = { 1.24485441 };
+            const float_t expected_beta[] = { 0.27785494,
+                                              0.53011669,
+                                              0.34352259,
+                                              0.40506216,
+                                              -1.26026447 };
+            const float_t expected_intercept[] = { 1.24485441 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(1, 5)
                                                .copy_data(expected_beta, 1, 5)
                                                .build();
             const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder()
-                                                    .set_data_type(data_type::float64)
+                                                    .set_data_type(dtype)
                                                     .set_layout(data_layout::row_major)
                                                     .allocate(1, 1)
                                                     .copy_data(expected_intercept, 1, 1)
@@ -351,13 +349,13 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         }
 
         else {
-            const double expected_beta[] = { 0.38217445,
-                                             0.2732197,
-                                             1.87135517,
-                                             0.63458468,
-                                             -2.08473134 };
+            const float_t expected_beta[] = { 0.38217445,
+                                              0.2732197,
+                                              1.87135517,
+                                              0.63458468,
+                                              -2.08473134 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(1, 5)
                                                .copy_data(expected_beta, 1, 5)
@@ -369,19 +367,21 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
     }
 
     void run_and_check_linear_indefinite_multioutput(double tol = 1e-3) {
-        const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
-                             0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
-                             0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
-        const double y[] = { 0.13632112,  1.53203308, -0.65996941,
-                             -0.31179486, 0.33776913, -2.2074711 };
+        const auto dtype =
+            std::is_same<float_t, double>::value ? data_type::float64 : data_type::float32;
+        const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
+                              0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
+                              0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
+        const float_t y[] = { 0.13632112,  1.53203308, -0.65996941,
+                              -0.31179486, 0.33776913, -2.2074711 };
         auto X_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 5)
                          .copy_data(X, 3, 5)
                          .build();
         auto y_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 2)
                          .copy_data(y, 3, 2)
@@ -392,19 +392,19 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         const auto coefs = train_res.get_coefficients();
 
         if (desc.get_result_options().test(result_options::intercept)) {
-            const double expected_beta[] = {
+            const float_t expected_beta[] = {
                 -0.18692112, -0.20034801, -0.09590892, -0.13672683, 0.56229012,
                 -0.97006008, 1.39413595,  0.49238012,  1.11041239,  -0.79213452,
             };
-            const double expected_intercept[] = { -0.48964358, 0.96467681 };
+            const float_t expected_intercept[] = { -0.48964358, 0.96467681 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(2, 5)
                                                .copy_data(expected_beta, 2, 5)
                                                .build();
             const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder()
-                                                    .set_data_type(data_type::float64)
+                                                    .set_data_type(dtype)
                                                     .set_layout(data_layout::row_major)
                                                     .allocate(1, 2)
                                                     .copy_data(expected_intercept, 1, 2)
@@ -421,11 +421,11 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         }
 
         else {
-            const double expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585,
-                                             0.88658098,  -0.88921961, 1.19505839,  1.67634561,
-                                             1.2882766,   -1.43103981 };
+            const float_t expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585,
+                                              0.88658098,  -0.88921961, 1.19505839,  1.67634561,
+                                              1.2882766,   -1.43103981 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(2, 5)
                                                .copy_data(expected_beta, 2, 5)
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 223f6753f69..77ffd3e2540 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -14,6 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
+#include <limits>
+
 #include "oneapi/dal/backend/primitives/lapack.hpp"
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include "oneapi/dal/backend/primitives/ndindexer.hpp"
@@ -59,6 +61,167 @@ inline sycl::event beta_copy_transform(sycl::queue& queue,
     });
 }
 
+/*
+This is an adaptation of the CPU version, which can be found in this file:
+cpp/daal/src/algorithms/service_kernel_math.h
+
+It solves a linear system A*x = b
+where 'b' might be a matrix (multiple RHS)
+
+It is intended as a fallback for solving linear regression, where these
+matrices are obtained as follows:
+    A = t(X)*X
+    b = t(X)*y
+
+It can handle systems that are not positive semi-definite, so it's used
+as a fallback when Cholesky fails or when it involves too small numbers
+(which tends to happen when floating point error results in a slightly
+positive matrix when it should have zero determinant in theory).
+*/
+template <mkl::uplo uplo, typename Float>
+sycl::event solve_spectral_decomposition(
+    sycl::queue& queue,
+    ndview<Float, 2>& A, // t(X)*X, LHS, gets overwritten
+    sycl::event& event_A,
+    ndview<Float, 2>& b, // t(X)*y, RHS, solution is outputted here
+    sycl::event& event_b,
+    const std::int64_t dim_A,
+    const std::int64_t nrhs) {
+    constexpr auto alloc = sycl::usm::alloc::device;
+
+    /* Decompose: A = Q * diag(l) * t(Q) */
+    /* Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on */
+    auto eigenvalues = array<Float>::empty(queue, dim_A * nrhs, alloc);
+    auto eigenvalues_view = ndview<Float, 1>::wrap(eigenvalues);
+    sycl::event syevd_event =
+        syevd<mkl::job::vec, uplo, Float>(queue, dim_A, A, dim_A, eigenvalues_view, { event_A });
+    const Float eps = std::numeric_limits<Float>::epsilon();
+
+    /* Discard too small singular values */
+    std::int64_t num_discarded;
+    {
+        /* This is placed inside a block because the array created here is not used any further */
+        auto eigenvalues_cpu = eigenvalues_view.to_host(queue, { syevd_event });
+        const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data();
+        const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1];
+        if (largest_ev <= eps) {
+            throw std::runtime_error(
+                "Could not solve linear system. Problem has too small singular values.");
+        }
+        const Float component_threshold = eps * largest_ev;
+        for (num_discarded = 0; num_discarded < dim_A - 1; num_discarded++) {
+            if (eigenvalues_cpu_ptr[num_discarded] > component_threshold)
+                break;
+        }
+    }
+
+    /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */
+    std::int64_t num_taken = dim_A - num_discarded;
+    auto ev_mutable = eigenvalues.get_mutable_data();
+    sycl::event inv_sqrt_eigenvalues_event = queue.submit([&](sycl::handler& h) {
+        h.depends_on(syevd_event);
+        h.parallel_for(num_taken, [=](const auto& i) {
+            const std::size_t ix = i + num_discarded;
+            ev_mutable[ix] = sycl::sqrt(Float(1) / ev_mutable[ix]);
+        });
+    });
+
+    auto Q_mutable = A.get_mutable_data();
+    sycl::event inv_sqrt_eigenvectors_event = queue.submit([&](sycl::handler& h) {
+        const auto range = oneapi::dal::backend::make_range_2d(num_taken, dim_A);
+        h.depends_on(inv_sqrt_eigenvalues_event);
+        h.parallel_for(range, [=](sycl::id<2> id) {
+            const std::size_t col = id[0] + num_discarded;
+            const std::size_t row = id[1];
+            Q_mutable[row + col * dim_A] *= ev_mutable[col];
+        });
+    });
+
+    /* Now calculate the actual solution: Qis * Qis' * B */
+    const std::int64_t eigenvectors_offset = num_discarded * dim_A;
+    if (nrhs == 1) {
+        sycl::event gemv_right_event =
+            mkl::blas::column_major::gemv(queue,
+                                          mkl::transpose::trans,
+                                          dim_A,
+                                          num_taken,
+                                          Float(1),
+                                          Q_mutable + eigenvectors_offset,
+                                          dim_A,
+                                          b.get_data(),
+                                          1,
+                                          Float(0),
+                                          ev_mutable,
+                                          1,
+                                          { inv_sqrt_eigenvectors_event, event_b });
+        return mkl::blas::column_major::gemv(queue,
+                                             mkl::transpose::nontrans,
+                                             dim_A,
+                                             num_taken,
+                                             Float(1),
+                                             Q_mutable + eigenvectors_offset,
+                                             dim_A,
+                                             ev_mutable,
+                                             1,
+                                             Float(0),
+                                             b.get_mutable_data(),
+                                             1,
+                                             { gemv_right_event });
+    }
+
+    else {
+        sycl::event gemm_right_event =
+            mkl::blas::column_major::gemm(queue,
+                                          mkl::transpose::trans,
+                                          mkl::transpose::nontrans,
+                                          num_taken,
+                                          nrhs,
+                                          dim_A,
+                                          Float(1),
+                                          Q_mutable + eigenvectors_offset,
+                                          dim_A,
+                                          b.get_data(),
+                                          dim_A,
+                                          Float(0),
+                                          ev_mutable,
+                                          num_taken,
+                                          { inv_sqrt_eigenvectors_event, event_b });
+        return mkl::blas::column_major::gemm(queue,
+                                             mkl::transpose::nontrans,
+                                             mkl::transpose::nontrans,
+                                             dim_A,
+                                             nrhs,
+                                             num_taken,
+                                             Float(1),
+                                             Q_mutable + eigenvectors_offset,
+                                             dim_A,
+                                             ev_mutable,
+                                             num_taken,
+                                             Float(0),
+                                             b.get_mutable_data(),
+                                             dim_A,
+                                             { gemm_right_event });
+    }
+}
+
+/* Returns the minimum value among entries in the diagonal of a square matrix */
+template <typename Float>
+Float diagonal_minimum(sycl::queue& queue,
+                       const Float* Matrix,
+                       const std::int64_t dim_matrix,
+                       sycl::event& event_Matrix) {
+    constexpr auto alloc = sycl::usm::alloc::device;
+    auto diag_min_holder = array<Float>::empty(queue, 1, alloc);
+    sycl::event diag_min_event = queue.submit([&](sycl::handler& h) {
+        auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>());
+        h.depends_on(event_Matrix);
+        h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) {
+            min_obj.combine(Matrix[i * (dim_matrix + 1)]);
+        });
+    });
+    return ndview<Float, 1>::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event });
+}
+
 template <mkl::uplo uplo, bool beta, typename Float, ndorder xlayout, ndorder ylayout>
 sycl::event solve_system(sycl::queue& queue,
                          const ndview<Float, 2, xlayout>& xtx,
@@ -70,12 +233,34 @@ sycl::event solve_system(sycl::queue& queue,
 
     auto [nxty, xty_event] = copy<ndorder::c, Float, ylayout, alloc>(queue, xty, dependencies);
     auto [nxtx, xtx_event] = copy<ndorder::c, Float, xlayout, alloc>(queue, xtx, dependencies);
+    const std::int64_t dim_xtx = xtx.get_dimension(0);
+
+    sycl::event solution_event;
 
     opt_array<Float> dummy{};
-    auto potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
-    auto potrs_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
+    try {
+        sycl::event potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
+        const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event);
+        if (diag_min <= 1e-6)
+            throw mkl::exception("");
+        solution_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
+    }
+    catch (mkl::exception& ex) {
+        const std::int64_t nrhs = nxty.get_dimension(0);
+        /* Note: this templated version of 'copy' reuses the layout that was specified in the previous copy */
+        sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies);
+        sycl::event xty_event_new = copy(queue, nxty, xty, dependencies);
+
+        solution_event = solve_spectral_decomposition<uplo, Float>(queue,
+                                                                   nxtx,
+                                                                   xtx_event_new,
+                                                                   nxty,
+                                                                   xty_event_new,
+                                                                   dim_xtx,
+                                                                   nrhs);
+    }
 
-    return beta_copy_transform<beta>(queue, nxty, final_xty, { potrs_event });
+    return beta_copy_transform<beta>(queue, nxty, final_xty, { solution_event });
 }
 
 #define INSTANTIATE(U, B, F, XL, YL)                                 \