oneapi-src · yihanwg · Sep 24, 2024 · Sep 24, 2024
@@ -224,14 +224,18 @@ RewriterMap dpct::createClassMethodsRewriterMap() {
       // cub::BlockStore.Store
       HEADER_INSERT_FACTORY(
           HeaderType::HT_DPCT_GROUP_Utils,
-          CONDITIONAL_FACTORY_ENTRY(
-              makeCheckAnd(CheckArgCount(2), CheckCUBEnumTemplateArg(3)),
-              MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
-                                        MemberExprBase(), false, "store",
-                                        NDITEM, ARG(0), ARG(1)),
-              UNSUPPORT_FACTORY_ENTRY("cub::BlockStore.Store",
-                                      Diagnostics::API_NOT_MIGRATED,
-                                      printCallExprPretty())))
+          CASE_FACTORY_ENTRY(
+              CASE(makeCheckAnd(CheckArgCount(2), CheckCUBEnumTemplateArg(3)),
+                   MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
+                                             MemberExprBase(), false, "store",
+                                             NDITEM, ARG(0), ARG(1))),
+              CASE(makeCheckAnd(CheckArgCount(3), CheckCUBEnumTemplateArg(3)),
+                   MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
+                                             MemberExprBase(), false, "store",
+                                             NDITEM, ARG(0), ARG(1), ARG(2))),
+              OTHERWISE(UNSUPPORT_FACTORY_ENTRY("cub::BlockStore.Store",
+                                                Diagnostics::API_NOT_MIGRATED,
+                                                printCallExprPretty()))))
 
   };
 }
@@ -844,6 +844,60 @@ __dpct_inline__ void store_direct_striped(const ItemT &item,
     work_item_iter[i * work_group_size] = data[i];
 }
 
+/// Store a blocked arrangement of items across a work-group into a linear
+/// segment of items, guarded by range.
+///
+/// \tparam T The data type to store.
+/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
+/// onto each work-item.
+/// \tparam OutputIteratorT  The random-access iterator type for output.
+/// \iterator.
+/// \tparam ItemT The sycl::nd_item index space class.
+/// \param item The calling work-item.
+/// \param output_iter The work-group's base output iterator for writing.
+/// \param data Data to store.
+/// \param valid_items Number of valid items to load
+template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
+          typename ItemT>
+__dpct_inline__ void
+store_direct_blocked(const ItemT &item, OutputIteratorT output_iter,
+                     T (&data)[ElementsPerWorkItem], size_t valid_items) {
+  size_t work_item_id = item.get_local_linear_id();
+  OutputIteratorT work_item_iter =
+      output_iter + (work_item_id * ElementsPerWorkItem);
+#pragma unroll
+  for (size_t i = 0; i < ElementsPerWorkItem; i++)
+    if (i + (work_item_id * ElementsPerWorkItem) < valid_items)
+      work_item_iter[i] = data[i];
+}
+
+/// Store a striped arrangement of items across a work-group into a linear
+/// segment of items, guarded by range.
+///
+/// \tparam T The data type to store.
+/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
+/// onto each work-item.
+/// \tparam OutputIteratorT  The random-access iterator type for output.
+/// \iterator.
+/// \tparam ItemT The sycl::nd_item index space class.
+/// \param item The calling work-item.
+/// \param output_iter The work-group's base output iterator for writing.
+/// \param items Data to store.
+/// \param valid_items Number of valid items to load
+template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
+          typename ItemT>
+__dpct_inline__ void
+store_direct_striped(const ItemT &item, OutputIteratorT output_iter,
+                     T (&data)[ElementsPerWorkItem], size_t valid_items) {
+  size_t work_group_size = item.get_group().get_local_linear_range();
+  size_t work_item_id = item.get_local_linear_id();
+  OutputIteratorT work_item_iter = output_iter + work_item_id;
+#pragma unroll
+  for (size_t i = 0; i < ElementsPerWorkItem; i++)
+    if ((i * work_group_size) + work_item_id < valid_items)
+      work_item_iter[i * work_group_size] = data[i];
+}
+
 // loads a linear segment of workgroup items into a subgroup striped
 // arrangement. Created as free function until exchange mechanism is
 // implemented.
@@ -1020,6 +1074,42 @@ class group_store {
           item, output_iter, data);
     }
   }
+
+  /// Store items into a linear segment of memory, guarded by range.
+  ///
+  /// Suppose 512 integer data elements partitioned across 128 work-items, where
+  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and
+  /// \p valid_items is 5, the \p output across the work-group is:
+  ///
+  ///   {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}.
+  ///
+  /// The blocked order \p output will be:
+  ///
+  ///   0, 1, 2, 3, 4, 5, 0, 0, ..., 0, 0, 0, 0.
+  ///
+  /// The striped order \p output will be:
+  ///
+  ///   0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0.
+  ///
+  /// \tparam ItemT The sycl::nd_item index space class.
+  /// \tparam OutputIteratorT The random-access iterator type for \p output
+  /// iterator.
+  /// \param item The work-item identifier.
+  /// \param input The input data of each work-item.
+  /// \param data The data to store.
+  /// \param valid_items Number of valid items to load
+  template <typename ItemT, typename OutputIteratorT>
+  __dpct_inline__ void store(const ItemT &item, OutputIteratorT output_iter,
+                             T (&data)[ElementsPerWorkItem],
+                             size_t valid_items) {
+    if constexpr (StoreAlgorithm == group_store_algorithm::blocked) {
+      store_direct_blocked<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
+          item, output_iter, data, valid_items);
+    } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) {
+      store_direct_striped<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
+          item, output_iter, data, valid_items);
+    }
+  }
 };
 } // namespace group
 } // namespace dpct

@@ -0,0 +1,77 @@
+// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
+// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
+// UNSUPPORTED: system-windows
+// RUN: dpct -in-root %S -out-root %T/blocklevel/blockstore %S/blockstore.cu --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
+// RUN: FileCheck --input-file %T/blocklevel/blockstore/blockstore.dp.cpp --match-full-lines %s
+// RUN: %if build_lit %{icpx -c -fsycl %T/blocklevel/blockstore/blockstore.dp.cpp -o %T/blocklevel/blockstore/blockstore.dp.o %}
+
+#include <cub/cub.cuh>
+
+__global__ void BlockedKernel(int *d_data, int valid_items) {
+  // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+  // CHECK: using BlockStore = dpct::group::group_store<int, 4>;
+  using BlockStore = cub::BlockStore<int, 128, 4>;
+
+  __shared__ typename BlockStore::TempStorage temp_storage;
+
+  int thread_data[4];
+  thread_data[0] = threadIdx.x * 4 + 0;
+  thread_data[1] = threadIdx.x * 4 + 1;
+  thread_data[2] = threadIdx.x * 4 + 2;
+  thread_data[3] = threadIdx.x * 4 + 3;
+
+  // CHECK: BlockStore(temp_storage).store(item_ct1, d_data, thread_data, valid_items);
+  BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+}
+
+__global__ void StripedKernel(int *d_data, int valid_items) {
+  // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+  // CHECK: using BlockStore = dpct::group::group_store<int, 4, dpct::group::group_store_algorithm::striped>;
+  using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_STRIPED>;
+
+  __shared__ typename BlockStore::TempStorage temp_storage;
+
+  int thread_data[4];
+  thread_data[0] = threadIdx.x * 4 + 0;
+  thread_data[1] = threadIdx.x * 4 + 1;
+  thread_data[2] = threadIdx.x * 4 + 2;
+  thread_data[3] = threadIdx.x * 4 + 3;
+  // CHECK: BlockStore(temp_storage).store(item_ct1, d_data, thread_data, valid_items);
+  BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+}
+
+int main() {
+  int *d_data;
+  cudaMallocManaged(&d_data, sizeof(int) * 512);
+  cudaMemset(d_data, 0, sizeof(int) * 512);
+  // CHECK: q_ct1.submit(
+  // CHECK-NEXT:   [&](sycl::handler &cgh) {
+  // CHECK-NEXT:     sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
+  // CHECK-EMPTY:
+  // CHECK-NEXT:     cgh.parallel_for(
+  // CHECK-NEXT:       sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
+  // CHECK-NEXT:       [=](sycl::nd_item<3> item_ct1) {
+  // CHECK-NEXT:         BlockedKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
+  // CHECK-NEXT:       });
+  // CHECK-NEXT:   });
+  BlockedKernel<<<1, 128>>>(d_data, 5);
+  cudaStreamSynchronize(0);
+  for (int i = 0; i < 512; ++i)
+    printf("%d%c", d_data[i], (i == 511 ? '\n' : ' '));
+  cudaMemset(d_data, 0, sizeof(int) * 512);
+  // CHECK: q_ct1.submit(
+  // CHECK-NEXT:   [&](sycl::handler &cgh) {
+  // CHECK-NEXT:     sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
+  // CHECK-EMPTY:
+  // CHECK-NEXT:     cgh.parallel_for(
+  // CHECK-NEXT:       sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
+  // CHECK-NEXT:       [=](sycl::nd_item<3> item_ct1) {
+  // CHECK-NEXT:         StripedKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
+  // CHECK-NEXT:       });
+  // CHECK-NEXT:   });
+  StripedKernel<<<1, 128>>>(d_data, 5);
+  cudaStreamSynchronize(0);
+  for (int i = 0; i < 512; ++i)
+    printf("%d%c", d_data[i], (i == 511 ? '\n' : ' '));
+  return 0;
+}