From b3c4d41ce862a03ea0ee6a102437d12a2d22cedf Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 12 Nov 2024 19:29:04 +0100 Subject: [PATCH] add spectral decomposition fallback on GPU --- .../dal/algo/linear_regression/test/batch.cpp | 3 +- .../algo/linear_regression/test/fixture.hpp | 78 +++---- .../backend/primitives/lapack/solve_dpc.cpp | 191 +++++++++++++++++- 3 files changed, 229 insertions(+), 43 deletions(-) diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp index a5585202c31..33f690d2110 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp @@ -51,9 +51,10 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) { } TEMPLATE_LIST_TEST_M(lr_batch_test, "LR with non-PSD matrix", "[lr][batch-nonpsd]", lr_types) { - SKIP_IF(this->non_psd_system_not_supported_on_device()); + SKIP_IF(this->not_float64_friendly()); this->generate(777); + this->run_and_check_linear_indefinite(); this->run_and_check_linear_indefinite_multioutput(); } diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp index e1e54092552..bfba2abf9e5 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp @@ -65,10 +65,6 @@ class lr_test : public te::crtp_algo_fixture { return static_cast(this); } - bool non_psd_system_not_supported_on_device() { - return this->get_policy().is_gpu(); - } - table compute_responses(const table& beta, const table& bias, const table& data) const { const auto s_count = data.get_row_count(); @@ -299,18 +295,20 @@ class lr_test : public te::crtp_algo_fixture { here is not positive-definite, thus it has an infinite number of possible solutions. The solution here is the one with minimum norm, which is typically more desirable. */ void run_and_check_linear_indefinite(double tol = 1e-3) { - const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, - 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, - 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; - const double y[] = { 0.13632112, 1.53203308, -0.65996941 }; + const auto dtype = + std::is_same::value ? data_type::float64 : data_type::float32; + const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const float_t y[] = { 0.13632112, 1.53203308, -0.65996941 }; auto X_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 5) .copy_data(X, 3, 5) .build(); auto y_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 1) .copy_data(y, 3, 1) @@ -321,20 +319,20 @@ class lr_test : public te::crtp_algo_fixture { const auto coefs = train_res.get_coefficients(); if (desc.get_result_options().test(result_options::intercept)) { - const double expected_beta[] = { 0.27785494, - 0.53011669, - 0.34352259, - 0.40506216, - -1.26026447 }; - const double expected_intercept[] = { 1.24485441 }; + const float_t expected_beta[] = { 0.27785494, + 0.53011669, + 0.34352259, + 0.40506216, + -1.26026447 }; + const float_t expected_intercept[] = { 1.24485441 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 5) .copy_data(expected_beta, 1, 5) .build(); const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 1) .copy_data(expected_intercept, 1, 1) @@ -351,13 +349,13 @@ class lr_test : public te::crtp_algo_fixture { } else { - const double expected_beta[] = { 0.38217445, - 0.2732197, - 1.87135517, - 0.63458468, - -2.08473134 }; + const float_t expected_beta[] = { 0.38217445, + 0.2732197, + 1.87135517, + 0.63458468, + -2.08473134 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 5) .copy_data(expected_beta, 1, 5) @@ -369,19 +367,21 @@ class lr_test : public te::crtp_algo_fixture { } void run_and_check_linear_indefinite_multioutput(double tol = 1e-3) { - const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, - 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, - 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; - const double y[] = { 0.13632112, 1.53203308, -0.65996941, - -0.31179486, 0.33776913, -2.2074711 }; + const auto dtype = + std::is_same::value ? data_type::float64 : data_type::float32; + const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const float_t y[] = { 0.13632112, 1.53203308, -0.65996941, + -0.31179486, 0.33776913, -2.2074711 }; auto X_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 5) .copy_data(X, 3, 5) .build(); auto y_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 2) .copy_data(y, 3, 2) @@ -392,19 +392,19 @@ class lr_test : public te::crtp_algo_fixture { const auto coefs = train_res.get_coefficients(); if (desc.get_result_options().test(result_options::intercept)) { - const double expected_beta[] = { + const float_t expected_beta[] = { -0.18692112, -0.20034801, -0.09590892, -0.13672683, 0.56229012, -0.97006008, 1.39413595, 0.49238012, 1.11041239, -0.79213452, }; - const double expected_intercept[] = { -0.48964358, 0.96467681 }; + const float_t expected_intercept[] = { -0.48964358, 0.96467681 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(2, 5) .copy_data(expected_beta, 2, 5) .build(); const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 2) .copy_data(expected_intercept, 1, 2) @@ -421,11 +421,11 @@ class lr_test : public te::crtp_algo_fixture { } else { - const double expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585, - 0.88658098, -0.88921961, 1.19505839, 1.67634561, - 1.2882766, -1.43103981 }; + const float_t expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585, + 0.88658098, -0.88921961, 1.19505839, 1.67634561, + 1.2882766, -1.43103981 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(2, 5) .copy_data(expected_beta, 2, 5) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index 223f6753f69..77ffd3e2540 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -14,6 +14,8 @@ * limitations under the License. *******************************************************************************/ +#include + #include "oneapi/dal/backend/primitives/lapack.hpp" #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/ndindexer.hpp" @@ -59,6 +61,167 @@ inline sycl::event beta_copy_transform(sycl::queue& queue, }); } +/* +This is an adaptation of the CPU version, which can be found in this file: +cpp/daal/src/algorithms/service_kernel_math.h + +It solves a linear system A*x = b +where 'b' might be a matrix (multiple RHS) + +It is intended as a fallback for solving linear regression, where these +matrices are obtained as follows: + A = t(X)*X + b = t(X)*y + +It can handle systems that are not positive semi-definite, so it's used +as a fallback when Cholesky fails or when it involves too small numbers +(which tends to happen when floating point error results in a slightly +positive matrix when it should have zero determinant in theory). +*/ +template +sycl::event solve_spectral_decomposition( + sycl::queue& queue, + ndview& A, // t(X)*X, LHS, gets overwritten + sycl::event& event_A, + ndview& b, // t(X)*y, RHS, solution is outputted here + sycl::event& event_b, + const std::int64_t dim_A, + const std::int64_t nrhs) { + constexpr auto alloc = sycl::usm::alloc::device; + + /* Decompose: A = Q * diag(l) * t(Q) */ + /* Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on */ + auto eigenvalues = array::empty(queue, dim_A * nrhs, alloc); + auto eigenvalues_view = ndview::wrap(eigenvalues); + sycl::event syevd_event = + syevd(queue, dim_A, A, dim_A, eigenvalues_view, { event_A }); + const Float eps = std::numeric_limits::epsilon(); + + /* Discard too small singular values */ + std::int64_t num_discarded; + { + /* This is placed inside a block because the array created here is not used any further */ + auto eigenvalues_cpu = eigenvalues_view.to_host(queue, { syevd_event }); + const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data(); + const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1]; + if (largest_ev <= eps) { + throw std::runtime_error( + "Could not solve linear system. Problem has too small singular values."); + } + const Float component_threshold = eps * largest_ev; + for (num_discarded = 0; num_discarded < dim_A - 1; num_discarded++) { + if (eigenvalues_cpu_ptr[num_discarded] > component_threshold) + break; + } + } + + /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */ + std::int64_t num_taken = dim_A - num_discarded; + auto ev_mutable = eigenvalues.get_mutable_data(); + sycl::event inv_sqrt_eigenvalues_event = queue.submit([&](sycl::handler& h) { + h.depends_on(syevd_event); + h.parallel_for(num_taken, [=](const auto& i) { + const std::size_t ix = i + num_discarded; + ev_mutable[ix] = sycl::sqrt(Float(1) / ev_mutable[ix]); + }); + }); + + auto Q_mutable = A.get_mutable_data(); + sycl::event inv_sqrt_eigenvectors_event = queue.submit([&](sycl::handler& h) { + const auto range = oneapi::dal::backend::make_range_2d(num_taken, dim_A); + h.depends_on(inv_sqrt_eigenvalues_event); + h.parallel_for(range, [=](sycl::id<2> id) { + const std::size_t col = id[0] + num_discarded; + const std::size_t row = id[1]; + Q_mutable[row + col * dim_A] *= ev_mutable[col]; + }); + }); + + /* Now calculate the actual solution: Qis * Qis' * B */ + const std::int64_t eigenvectors_offset = num_discarded * dim_A; + if (nrhs == 1) { + sycl::event gemv_right_event = + mkl::blas::column_major::gemv(queue, + mkl::transpose::trans, + dim_A, + num_taken, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + b.get_data(), + 1, + Float(0), + ev_mutable, + 1, + { inv_sqrt_eigenvectors_event, event_b }); + return mkl::blas::column_major::gemv(queue, + mkl::transpose::nontrans, + dim_A, + num_taken, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + ev_mutable, + 1, + Float(0), + b.get_mutable_data(), + 1, + { gemv_right_event }); + } + + else { + sycl::event gemm_right_event = + mkl::blas::column_major::gemm(queue, + mkl::transpose::trans, + mkl::transpose::nontrans, + num_taken, + nrhs, + dim_A, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + b.get_data(), + dim_A, + Float(0), + ev_mutable, + num_taken, + { inv_sqrt_eigenvectors_event, event_b }); + return mkl::blas::column_major::gemm(queue, + mkl::transpose::nontrans, + mkl::transpose::nontrans, + dim_A, + nrhs, + num_taken, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + ev_mutable, + num_taken, + Float(0), + b.get_mutable_data(), + dim_A, + { gemm_right_event }); + } +} + +/* Returns the minimum value among entries in the diagonal of a square matrix */ +template +Float diagonal_minimum(sycl::queue& queue, + const Float* Matrix, + const std::int64_t dim_matrix, + sycl::event& event_Matrix) { + constexpr auto alloc = sycl::usm::alloc::device; + auto diag_min_holder = array::empty(queue, 1, alloc); + sycl::event diag_min_event = queue.submit([&](sycl::handler& h) { + auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>()); + h.depends_on(event_Matrix); + h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) { + min_obj.combine(Matrix[i * (dim_matrix + 1)]); + }); + }); + return ndview::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event }); +} + template sycl::event solve_system(sycl::queue& queue, const ndview& xtx, @@ -70,12 +233,34 @@ sycl::event solve_system(sycl::queue& queue, auto [nxty, xty_event] = copy(queue, xty, dependencies); auto [nxtx, xtx_event] = copy(queue, xtx, dependencies); + const std::int64_t dim_xtx = xtx.get_dimension(0); + + sycl::event solution_event; opt_array dummy{}; - auto potrf_event = potrf_factorization(queue, nxtx, dummy, { xtx_event }); - auto potrs_event = potrs_solution(queue, nxtx, nxty, dummy, { potrf_event, xty_event }); + try { + sycl::event potrf_event = potrf_factorization(queue, nxtx, dummy, { xtx_event }); + const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event); + if (diag_min <= 1e-6) + throw mkl::exception(""); + solution_event = potrs_solution(queue, nxtx, nxty, dummy, { potrf_event, xty_event }); + } + catch (mkl::exception& ex) { + const std::int64_t nrhs = nxty.get_dimension(0); + /* Note: this templated version of 'copy' reuses the layout that was specified in the previous copy */ + sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies); + sycl::event xty_event_new = copy(queue, nxty, xty, dependencies); + + solution_event = solve_spectral_decomposition(queue, + nxtx, + xtx_event_new, + nxty, + xty_event_new, + dim_xtx, + nrhs); + } - return beta_copy_transform(queue, nxty, final_xty, { potrs_event }); + return beta_copy_transform(queue, nxty, final_xty, { solution_event }); } #define INSTANTIATE(U, B, F, XL, YL) \