Block Sptrsv fixes (#2376)
* Go back to original impl with vector reduce (see the reducer sketch after this message)

Signed-off-by: James Foucar <[email protected]>

* Disable tp1 test for clang+cuda+complex

Signed-off-by: James Foucar <[email protected]>

* Fixes, and warn users about the broken case

Signed-off-by: James Foucar <[email protected]>

* formatting

Signed-off-by: James Foucar <[email protected]>

* Fix comments and print skip message

Signed-off-by: James Foucar <[email protected]>

---------

Signed-off-by: James Foucar <[email protected]>
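
For readers unfamiliar with the pattern, the sketch below illustrates the "vector reduce" approach this commit returns to: a fixed-size array value type combined through a custom Kokkos reducer, analogous to the ArrayType/SumArray pair added in the diff. It is a minimal sketch only; the names Array4, ArraySum, and N are hypothetical and not part of this change, and it assumes a recent Kokkos release where the non-volatile join overload is sufficient.

#include <Kokkos_Core.hpp>
#include <cstdio>

constexpr int N = 4;  // hypothetical fixed array width (MAX_VEC_SIZE plays this role in the diff)

// Value type holding N partial sums.
struct Array4 {
  double v[N];
  KOKKOS_INLINE_FUNCTION Array4() {
    for (int i = 0; i < N; ++i) v[i] = 0.0;
  }
  KOKKOS_INLINE_FUNCTION Array4 &operator+=(const Array4 &o) {
    for (int i = 0; i < N; ++i) v[i] += o.v[i];
    return *this;
  }
};

// Custom reducer: tells Kokkos how to init/join Array4 values and where the result lives.
struct ArraySum {
  using reducer          = ArraySum;
  using value_type       = Array4;
  using result_view_type = Kokkos::View<value_type *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;

  value_type &m_value;

  KOKKOS_INLINE_FUNCTION ArraySum(value_type &value) : m_value(value) {}
  KOKKOS_INLINE_FUNCTION void join(value_type &dest, const value_type &src) const { dest += src; }
  KOKKOS_INLINE_FUNCTION void init(value_type &val) const {
    for (int i = 0; i < N; ++i) val.v[i] = 0.0;
  }
  KOKKOS_INLINE_FUNCTION value_type &reference() const { return m_value; }
  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(&m_value, 1); }
  KOKKOS_INLINE_FUNCTION bool references_scalar() const { return true; }
};

int main(int argc, char *argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Array4 acc;  // all N slots are reduced in a single parallel_reduce
    Kokkos::parallel_reduce(
        "array_sum", Kokkos::RangePolicy<>(0, 100),
        KOKKOS_LAMBDA(const int i, Array4 &partial) {
          for (int k = 0; k < N; ++k) partial.v[k] += i;  // each slot accumulates 0 + 1 + ... + 99
        },
        ArraySum(acc));
    std::printf("acc[0] = %g\n", acc.v[0]);  // prints 4950
  }
  Kokkos::finalize();
  return 0;
}
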
jgfouca authored Oct 15, 2024
1 parent ae06347 commit b844669
Showing 3 changed files with 196 additions and 96 deletions.
275 changes: 179 additions & 96 deletions sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
@@ -79,7 +79,7 @@ struct SptrsvWrap {
using range_type = Kokkos::pair<int, int>;

// Tag structs
struct UnsortedTag {}; // This doesn't appear to be supported
struct UnsortedTag {};
struct LargerCutoffTag {};
struct UnsortedLargerCutoffTag {};

@@ -115,7 +115,9 @@ struct SptrsvWrap {
RHSType rhs;
entries_t nodes_grouped_by_level;

using reftype = scalar_t &;
using reftype = scalar_t &;
using ArrayType = reftype;
using SumArray = reftype;

struct SBlock {
template <typename T>
@@ -141,6 +143,16 @@ struct SptrsvWrap {
KOKKOS_INLINE_FUNCTION
size_type get_block_size() const { return 0; }

// multiply_subtract. C -= A * B
KOKKOS_INLINE_FUNCTION
static void multiply_subtract(const scalar_t &a, const scalar_t &b, scalar_t &c) { c -= a * b; }

KOKKOS_INLINE_FUNCTION
static void copy(const member_type &, scalar_t &, const scalar_t &) {}

KOKKOS_INLINE_FUNCTION
static void copy(scalar_t &, const scalar_t &) {}

// lget
KOKKOS_INLINE_FUNCTION
scalar_t &lget(const size_type row) const { return lhs(row); }
@@ -195,6 +207,60 @@ struct SptrsvWrap {

using reftype = Vector;

struct ArrayType {
scalar_t m_data[MAX_VEC_SIZE];

KOKKOS_INLINE_FUNCTION
ArrayType() { init(); }

KOKKOS_INLINE_FUNCTION
ArrayType(const ArrayType &rhs_) {
for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = rhs_.m_data[i];
}

KOKKOS_INLINE_FUNCTION
ArrayType(const Vector &) { init(); }

KOKKOS_INLINE_FUNCTION
void init() {
for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = 0;
}

KOKKOS_INLINE_FUNCTION
ArrayType &operator+=(const ArrayType &rhs_) {
for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i];
return *this;
}
};

struct SumArray {
using reducer = SumArray;
using value_type = ArrayType;
using result_view_type = Kokkos::View<value_type *, execution_space, Kokkos::MemoryUnmanaged>;

private:
value_type &m_value;

public:
KOKKOS_INLINE_FUNCTION
SumArray(value_type &value) : m_value(value) {}

KOKKOS_INLINE_FUNCTION
void join(value_type &dest, const value_type &src) const { dest += src; }

KOKKOS_INLINE_FUNCTION
void init(value_type &val) const { val.init(); }

KOKKOS_INLINE_FUNCTION
value_type &reference() const { return m_value; }

KOKKOS_INLINE_FUNCTION
result_view_type view() const { return result_view_type(&m_value, 1); }

KOKKOS_INLINE_FUNCTION
bool reference_scalar() const { return true; }
};

RowMapType row_map;
EntriesType entries;
ValuesType values;
@@ -215,6 +281,7 @@ struct SptrsvWrap {
block_size(block_size_),
block_items(block_size * block_size) {
KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?");
KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, "Max supported block size is " << MAX_VEC_SIZE);
}

KOKKOS_INLINE_FUNCTION
@@ -257,17 +324,17 @@ struct SptrsvWrap {
team.team_barrier();
KokkosBatched::TeamLU<member_type, KokkosBatched::Algo::LU::Blocked>::invoke(team, LU);

// A = LU
// A^-1 = U^-1 * L^-1
// b = (b * U^-1) * L^-1, so do U trsv first
// Ax = LUx = Lz = b, we use the change of variable z = U*x
// z = L^-1 * b, first we solve for z, storing the result back into b
// x = U^-1 * z, second we solve for x, again storing the result back into b
team.team_barrier();
KokkosBatched::TeamTrsv<member_type, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b);

team.team_barrier();
KokkosBatched::TeamTrsv<member_type, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose,
KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU,
b);

team.team_barrier();
KokkosBatched::TeamTrsv<member_type, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b);
}

// serial divide. b /= A (b = b * A^-1)
@@ -278,21 +345,44 @@ struct SptrsvWrap {

// Need a temp block to do LU of A
const auto block_size_ = b.size();
KK_KERNEL_REQUIRE_MSG(block_size_ <= MAX_VEC_SIZE,
"Max supported block size for range-policy is 16. Use team-policy alg if you need more.");

Block LU(&buff[0], block_size_, block_size_);
assign(LU, A);
KokkosBatched::SerialLU<KokkosBatched::Algo::LU::Blocked>::invoke(LU);

// A = LU
// A^-1 = U^-1 * L^-1
// b = (b * U^-1) * L^-1, so do U trsv first
// Ax = LUx = Lz = b, we use the change of variable z = U*x
// z = L^-1 * b, first we solve for z, storing the result back into b
// x = U^-1 * z, second we solve for x, again storing the result back into b
KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b);

KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose,
KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b);
}

KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b);
// multiply_subtract. C -= A * B
KOKKOS_INLINE_FUNCTION
static void multiply_subtract(const CBlock &A, const CVector &b, ArrayType &ca) {
Vector c(&ca.m_data[0], b.size());
multiply_subtract(A, b, c);
}

KOKKOS_INLINE_FUNCTION
static void multiply_subtract(const CBlock &A, const CVector &b, Vector &c) {
// Use gemv. alpha is hardcoded to -1, beta hardcoded to 1
KokkosBlas::SerialGemv<KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Blocked>::invoke(-1.0, A, b, 1.0,
c);
}

KOKKOS_INLINE_FUNCTION
static void copy(const member_type &team, const Vector &lhs_, ArrayType &rhsa) {
CVector rhs_(&rhsa.m_data[0], lhs_.size());
assign(team, lhs_, rhs_);
}

KOKKOS_INLINE_FUNCTION
static void copy(const Vector &lhs_, ArrayType &rhsa) {
CVector rhs_(&rhsa.m_data[0], lhs_.size());
assign(lhs_, rhs_);
}

// lget
@@ -331,58 +421,69 @@ struct SptrsvWrap {
*/
template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, bool BlockEnabled>
struct Intermediate : public Common<RowMapType, EntriesType, ValuesType, LHSType, RHSType, BlockEnabled> {
using Base = Common<RowMapType, EntriesType, ValuesType, LHSType, RHSType, BlockEnabled>;
using Base = Common<RowMapType, EntriesType, ValuesType, LHSType, RHSType, BlockEnabled>;
using accum_t = std::conditional_t<BlockEnabled, typename Base::ArrayType, scalar_t>;

Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_,
const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0)
: Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {}

struct ReduceFunctorBasic {
struct ReduceSumFunctor {
const Base *m_obj;
const lno_t rowid;
lno_t diag;

KOKKOS_INLINE_FUNCTION
ReduceFunctorBasic(const Base *obj, const lno_t = 0) : m_obj(obj) {}

KOKKOS_INLINE_FUNCTION
static void multiply_subtract(const scalar_t &val, const scalar_t &lhs_col_val, scalar_t &accum) {
accum -= val * lhs_col_val;
}

KOKKOS_INLINE_FUNCTION
void operator()(size_type i, scalar_t &accum) const {
void operator()(size_type i, accum_t &accum) const {
const auto colid = m_obj->entries(i);
multiply_subtract(m_obj->vget(i), m_obj->lget(colid), accum);
auto val = m_obj->vget(i);
auto lhs_colid = m_obj->lget(colid);
// accum -= val * lhs_colid;
if constexpr (BlockEnabled) {
accum_t temp;
Base::multiply_subtract(val, lhs_colid, temp);
accum += temp;
} else {
Base::multiply_subtract(val, lhs_colid, accum);
}
KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag");
}
};

struct ReduceFunctorBlock : public ReduceFunctorBasic {
using P = ReduceFunctorBasic;

const size_type block_size;
const size_type b;

KOKKOS_INLINE_FUNCTION
ReduceFunctorBlock(const Base *obj, const size_type block_size_, const size_type b_, const lno_t = 0)
: P(obj), block_size(block_size_), b(b_) {}
struct ReduceSumDiagFunctor {
const Base *m_obj;
const lno_t rowid;
mutable lno_t diag;

KOKKOS_INLINE_FUNCTION
void operator()(size_type i, scalar_t &accum) const {
const auto idx = i / block_size;
const auto colid = P::m_obj->entries(idx);
P::multiply_subtract(P::m_obj->vget(idx)(b, i % block_size), P::m_obj->lget(colid)(b), accum);
void operator()(size_type i, accum_t &accum) const {
const auto colid = m_obj->entries(i);
if (colid != rowid) {
auto val = m_obj->vget(i);
auto lhs_colid = m_obj->lget(colid);
// accum -= val * lhs_colid;
if constexpr (BlockEnabled) {
accum_t temp;
Base::multiply_subtract(val, lhs_colid, temp);
accum += temp;
} else {
Base::multiply_subtract(val, lhs_colid, accum);
}
} else {
diag = i;
}
}
};

/**
* If we want to support Unsorted, we'll need a Functor that returns the ptr
* of the diag item (colid == rowid). Possibly via multi-reduce? The UnsortedTag
* is defined above but no policies actually use it.
*/

template <bool IsSerial, bool IsSorted, bool IsLower, bool UseThreadVec = false>
KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, const int my_rank, const long node_count) const {
using reduce_item_t = typename Base::ArrayType;
using reducer_t = typename Base::SumArray;
using functor_t = std::conditional_t<IsSorted, ReduceSumFunctor, ReduceSumDiagFunctor>;

static_assert(!((!IsSerial && BlockEnabled) && UseThreadVec),
"ThreadVectorRanges are not yet supported for block-enabled");
static_assert(!(IsSerial && UseThreadVec), "Requested thread vector range in serial?");
static_assert(IsSorted, "Unsorted is not yet supported.");

const auto rowid = Base::nodes_grouped_by_level(my_rank + node_count);
const auto soffset = Base::row_map(rowid);
@@ -394,76 +495,58 @@ struct SptrsvWrap {
const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0);

// We don't need the reducer to find the diag item if sorted
functor_t rf{this, rowid, -1};
typename Base::reftype lhs_val = Base::lget(rowid);

const auto block_size_ = BlockEnabled ? Base::get_block_size() : 1;
(void)block_size_; // Some settings do not use this var
reduce_item_t reduce = lhs_val;

if constexpr (IsSerial) {
KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial");
KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?");
if constexpr (BlockEnabled) {
for (size_type b = 0; b < block_size_; ++b) {
ReduceFunctorBlock rf(this, block_size_, b, rowid);
for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) {
rf(i, lhs_val(b));
}
}
} else {
ReduceFunctorBasic rf(this, rowid);
for (size_type i = itr_b; i < itr_e; ++i) {
rf(i, lhs_val);
}
for (auto ptr = itr_b; ptr < itr_e; ++ptr) {
rf(ptr, reduce);
}
Base::copy(lhs_val, reduce);
} else {
KK_KERNEL_ASSERT_MSG(team != nullptr, "Cannot do team operations without team");
if constexpr (!UseThreadVec) {
if constexpr (BlockEnabled) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size_), [&](size_type b) {
ReduceFunctorBlock rf(this, block_size_, b, rowid);
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size_, itr_e * block_size_), rf,
lhs_val(b));
});
} else {
ReduceFunctorBasic rf(this, rowid);
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, lhs_val);
}
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, reducer_t(reduce));
team->team_barrier();
Base::copy(*team, lhs_val, reduce);
team->team_barrier();
} else {
if constexpr (BlockEnabled) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(*team, block_size_), [&](size_type b) {
ReduceFunctorBlock rf(this, block_size_, b, rowid);
for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) {
rf(i, lhs_val(b));
}
});
} else {
ReduceFunctorBasic rf(this, rowid);
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val);
}
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce));
Base::copy(lhs_val, reduce);
}
}

// If sorted, we already know the diag. Otherwise, get it from the reducer
const lno_t diag = IsLower ? eoffset - 1 : soffset;
rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag;

// At end, handle the diag element. We need to be careful to avoid race
// conditions here.
if constexpr (IsSerial) {
// Serial case is easy, there's only 1 thread so just do the
// add_and_divide
KK_KERNEL_ASSERT_MSG(diag != -1, "Serial should always know diag");
Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag));
KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag");
Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag));
} else {
// Parallel sorted case is complex. All threads know what the diag is.
// If we have a team sharing the work, we need to ensure only one
// thread performs the add_and_divide (except in BlockEnabled, then
// we can use team operations).
KK_KERNEL_ASSERT_MSG(diag != -1, "Sorted should always know diag");
if constexpr (!UseThreadVec) {
Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(diag));
if constexpr (IsSorted) {
// Parallel sorted case is complex. All threads know what the diag is.
// If we have a team sharing the work, we need to ensure only one
// thread performs the add_and_divide (except in BlockEnabled, then
// we can use team operations).
KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag");
if constexpr (!UseThreadVec) {
Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(rf.diag));
} else {
Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag));
}
} else {
Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag));
// Parallel unsorted case. Only one thread should know what the diag
// item is. We have that one do the add_and_divide.
if (rf.diag != -1) {
Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag));
}
}
}
}
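
The solve_impl routine above parallelizes a standard sparse triangular-solve recurrence: for each row, subtract the off-diagonal contributions from the right-hand side, then divide by the diagonal entry. As a point of reference, here is a minimal serial sketch over a plain CSR matrix. It is illustrative only and not part of this change; the helper name is hypothetical, and it assumes, as the IsSorted/IsLower path does, that column indices in each row are sorted and the diagonal entry is stored last.

#include <cstddef>
#include <vector>

// x(i) = ( b(i) - sum_{j < i} A(i,j) * x(j) ) / A(i,i)
// Assumes a lower-triangular CSR matrix with sorted columns and the diagonal stored
// last in every row (mirroring how solve_impl picks diag = eoffset - 1 when IsLower).
void lower_trsv_csr(const std::vector<std::size_t> &row_map, const std::vector<int> &entries,
                    const std::vector<double> &values, const std::vector<double> &b, std::vector<double> &x) {
  x.resize(b.size());
  for (std::size_t row = 0; row < b.size(); ++row) {
    double accum = b[row];
    const std::size_t diag = row_map[row + 1] - 1;  // diagonal entry of this row
    for (std::size_t k = row_map[row]; k < diag; ++k) {
      accum -= values[k] * x[entries[k]];  // accum -= A(row, col) * x(col), the multiply_subtract step
    }
    x[row] = accum / values[diag];  // divide by the diagonal, as add_and_divide does above
  }
}
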
