diff --git a/clang/lib/DPCT/RulesLangLib/CUB/RewriterClassMethods.cpp b/clang/lib/DPCT/RulesLangLib/CUB/RewriterClassMethods.cpp
index 0dabaa56e23e..755e4dc69040 100644
--- a/clang/lib/DPCT/RulesLangLib/CUB/RewriterClassMethods.cpp
+++ b/clang/lib/DPCT/RulesLangLib/CUB/RewriterClassMethods.cpp
@@ -224,14 +224,18 @@ RewriterMap dpct::createClassMethodsRewriterMap() {
       // cub::BlockStore.Store
       HEADER_INSERT_FACTORY(
           HeaderType::HT_DPCT_GROUP_Utils,
-          CONDITIONAL_FACTORY_ENTRY(
-              makeCheckAnd(CheckArgCount(2), CheckCUBEnumTemplateArg(3)),
-              MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
-                                        MemberExprBase(), false, "store",
-                                        NDITEM, ARG(0), ARG(1)),
-              UNSUPPORT_FACTORY_ENTRY("cub::BlockStore.Store",
-                                      Diagnostics::API_NOT_MIGRATED,
-                                      printCallExprPretty())))
+          CASE_FACTORY_ENTRY(
+              CASE(makeCheckAnd(CheckArgCount(2), CheckCUBEnumTemplateArg(3)),
+                   MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
+                                             MemberExprBase(), false, "store",
+                                             NDITEM, ARG(0), ARG(1))),
+              CASE(makeCheckAnd(CheckArgCount(3), CheckCUBEnumTemplateArg(3)),
+                   MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
+                                             MemberExprBase(), false, "store",
+                                             NDITEM, ARG(0), ARG(1), ARG(2))),
+              OTHERWISE(UNSUPPORT_FACTORY_ENTRY("cub::BlockStore.Store",
+                                                Diagnostics::API_NOT_MIGRATED,
+                                                printCallExprPretty()))))
   };
 }
 
diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp
index 545191b59482..e0fc0a050109 100644
--- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp
+++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp
@@ -844,6 +844,60 @@ __dpct_inline__ void store_direct_striped(const ItemT &item,
     work_item_iter[i * work_group_size] = data[i];
 }
 
+/// Store a blocked arrangement of items across a work-group into a linear
+/// segment of items, guarded by range.
+///
+/// \tparam T The data type to store.
+/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
+/// onto each work-item.
+/// \tparam OutputIteratorT The random-access iterator type for \p output
+/// iterator.
+/// \tparam ItemT The sycl::nd_item index space class.
+/// \param item The calling work-item.
+/// \param output_iter The work-group's base output iterator for writing.
+/// \param data Data to store.
+/// \param valid_items Number of valid items to store.
+template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
+          typename ItemT>
+__dpct_inline__ void
+store_direct_blocked(const ItemT &item, OutputIteratorT output_iter,
+                     T (&data)[ElementsPerWorkItem], size_t valid_items) {
+  size_t work_item_id = item.get_local_linear_id();
+  OutputIteratorT work_item_iter =
+      output_iter + (work_item_id * ElementsPerWorkItem);
+#pragma unroll
+  for (size_t i = 0; i < ElementsPerWorkItem; i++)
+    if (i + (work_item_id * ElementsPerWorkItem) < valid_items)
+      work_item_iter[i] = data[i];
+}
+
+/// Store a striped arrangement of items across a work-group into a linear
+/// segment of items, guarded by range.
+///
+/// \tparam T The data type to store.
+/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
+/// onto each work-item.
+/// \tparam OutputIteratorT The random-access iterator type for \p output
+/// iterator.
+/// \tparam ItemT The sycl::nd_item index space class.
+/// \param item The calling work-item.
+/// \param output_iter The work-group's base output iterator for writing.
+/// \param data Data to store.
+/// \param valid_items Number of valid items to store.
+template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
+          typename ItemT>
+__dpct_inline__ void
+store_direct_striped(const ItemT &item, OutputIteratorT output_iter,
+                     T (&data)[ElementsPerWorkItem], size_t valid_items) {
+  size_t work_group_size = item.get_group().get_local_linear_range();
+  size_t work_item_id = item.get_local_linear_id();
+  OutputIteratorT work_item_iter = output_iter + work_item_id;
+#pragma unroll
+  for (size_t i = 0; i < ElementsPerWorkItem; i++)
+    if ((i * work_group_size) + work_item_id < valid_items)
+      work_item_iter[i * work_group_size] = data[i];
+}
+
 // loads a linear segment of workgroup items into a subgroup striped
 // arrangement. Created as free function until exchange mechanism is
 // implemented.
@@ -1020,6 +1074,42 @@
           item, output_iter, data);
     }
   }
+
+  /// Store items into a linear segment of memory, guarded by range.
+  ///
+  /// Suppose 512 integer data elements are partitioned across 128 work-items,
+  /// where each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and
+  /// \p valid_items is 5; the \p output across the work-group is initially:
+  ///
+  /// {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}.
+  ///
+  /// The blocked order \p output will be:
+  ///
+  /// 0, 1, 2, 3, 4, 0, 0, 0, ..., 0, 0, 0, 0.
+  ///
+  /// The striped order \p output will be:
+  ///
+  /// 0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0.
+  ///
+  /// \tparam ItemT The sycl::nd_item index space class.
+  /// \tparam OutputIteratorT The random-access iterator type for \p output
+  /// iterator.
+  /// \param item The work-item identifier.
+  /// \param output_iter The work-group's base output iterator for writing.
+  /// \param data The data to store.
+  /// \param valid_items Number of valid items to store.
+  template <typename ItemT, typename OutputIteratorT>
+  __dpct_inline__ void store(const ItemT &item, OutputIteratorT output_iter,
+                             T (&data)[ElementsPerWorkItem],
+                             size_t valid_items) {
+    if constexpr (StoreAlgorithm == group_store_algorithm::blocked) {
+      store_direct_blocked(
+          item, output_iter, data, valid_items);
+    } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) {
+      store_direct_striped(
+          item, output_iter, data, valid_items);
+    }
+  }
 };
 } // namespace group
 } // namespace dpct
diff --git a/clang/test/dpct/cub/blocklevel/blockstore.cu b/clang/test/dpct/cub/blocklevel/blockstore.cu
new file mode 100644
index 000000000000..36eb9ad7334f
--- /dev/null
+++ b/clang/test/dpct/cub/blocklevel/blockstore.cu
@@ -0,0 +1,77 @@
+// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
+// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
+// UNSUPPORTED: system-windows
+// RUN: dpct -in-root %S -out-root %T/blocklevel/blockstore %S/blockstore.cu --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
+// RUN: FileCheck --input-file %T/blocklevel/blockstore/blockstore.dp.cpp --match-full-lines %s
+// RUN: %if build_lit %{icpx -c -fsycl %T/blocklevel/blockstore/blockstore.dp.cpp -o %T/blocklevel/blockstore/blockstore.dp.o %}
+
+#include <cub/cub.cuh>
+
+__global__ void BlockedKernel(int *d_data, int valid_items) {
+  // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+  // CHECK: using BlockStore = dpct::group::group_store<int, 4>;
+  using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_DIRECT>;
+
+  __shared__ typename BlockStore::TempStorage temp_storage;
+
+  int thread_data[4];
+  thread_data[0] = threadIdx.x * 4 + 0;
+  thread_data[1] = threadIdx.x * 4 + 1;
+  thread_data[2] = threadIdx.x * 4 + 2;
+  thread_data[3] = threadIdx.x * 4 + 3;
+
+  // CHECK: BlockStore(temp_storage).store(item_ct1, d_data, thread_data, valid_items);
+  BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+}
+
+__global__ void StripedKernel(int *d_data, int valid_items) {
+  // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+  // CHECK: using BlockStore = dpct::group::group_store<int, 4, dpct::group::group_store_algorithm::striped>;
+  using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_STRIPED>;
+
+  __shared__ typename BlockStore::TempStorage temp_storage;
+
+  int thread_data[4];
+  thread_data[0] = threadIdx.x * 4 + 0;
+  thread_data[1] = threadIdx.x * 4 + 1;
+  thread_data[2] = threadIdx.x * 4 + 2;
+  thread_data[3] = threadIdx.x * 4 + 3;
+  // CHECK: BlockStore(temp_storage).store(item_ct1, d_data, thread_data, valid_items);
+  BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+}
+
+int main() {
+  int *d_data;
+  cudaMallocManaged(&d_data, sizeof(int) * 512);
+  cudaMemset(d_data, 0, sizeof(int) * 512);
+  // CHECK: q_ct1.submit(
+  // CHECK-NEXT: [&](sycl::handler &cgh) {
+  // CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
+  // CHECK-EMPTY:
+  // CHECK-NEXT: cgh.parallel_for(
+  // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
+  // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
+  // CHECK-NEXT: BlockedKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
+  // CHECK-NEXT: });
+  // CHECK-NEXT: });
+  BlockedKernel<<<1, 128>>>(d_data, 5);
+  cudaStreamSynchronize(0);
+  for (int i = 0; i < 512; ++i)
+    printf("%d%c", d_data[i], (i == 511 ? '\n' : ' '));
+  cudaMemset(d_data, 0, sizeof(int) * 512);
+  // CHECK: q_ct1.submit(
+  // CHECK-NEXT: [&](sycl::handler &cgh) {
+  // CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4, dpct::group::group_store_algorithm::striped>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
+  // CHECK-EMPTY:
+  // CHECK-NEXT: cgh.parallel_for(
+  // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
+  // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
+  // CHECK-NEXT: StripedKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
+  // CHECK-NEXT: });
+  // CHECK-NEXT: });
+  StripedKernel<<<1, 128>>>(d_data, 5);
+  cudaStreamSynchronize(0);
+  for (int i = 0; i < 512; ++i)
+    printf("%d%c", d_data[i], (i == 511 ? '\n' : ' '));
+  return 0;
+}
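For reference, a minimal SYCL usage sketch (not part of the patch) of how the new range-guarded store() overload is expected to be called directly, mirroring the BlockedKernel test above. The queue setup, include set, variable names, and the assumption that dpct::group::group_store is constructed from a uint8_t local-memory pointer are inferred from the migrated CHECK lines in this patch and are illustrative only.

// Illustrative sketch: exercise dpct::group::group_store<int, 4>::store()
// with a valid_items guard of 5, as in the migrated test above.
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>        // assumed dpct runtime entry header
#include <dpct/group_utils.hpp> // header added by this patch
#include <cstdio>

int main() {
  sycl::queue q;
  constexpr int n = 512;
  int *d_data = sycl::malloc_shared<int>(n, q);
  q.memset(d_data, 0, n * sizeof(int)).wait();

  // Local-memory size for a 128-work-item group storing 4 ints per item,
  // matching the get_local_memory_size() call in the migrated test output.
  size_t temp_bytes =
      dpct::group::group_store<int, 4>::get_local_memory_size(128);

  q.submit([&](sycl::handler &cgh) {
     sycl::local_accessor<uint8_t, 1> temp_acc(sycl::range<1>(temp_bytes), cgh);
     cgh.parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
         [=](sycl::nd_item<3> item) {
           // Each work-item owns 4 consecutive values, blocked order.
           int thread_data[4];
           for (int j = 0; j < 4; ++j)
             thread_data[j] = static_cast<int>(item.get_local_linear_id()) * 4 + j;
           // Guarded store: only the first 5 output elements are written.
           dpct::group::group_store<int, 4>(&temp_acc[0])
               .store(item, d_data, thread_data, 5);
         });
   }).wait();

  for (int i = 0; i < 8; ++i)
    printf("%d ", d_data[i]); // expected: 0 1 2 3 4 0 0 0
  printf("\n");
  sycl::free(d_data, q);
  return 0;
}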