Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCLomatic] Support migration of cub::BlockStore::Store(OutputIteratorT block_itr, T(&items)[ITEMS_PRE_THREAD], int valid_items) #2374

Open
wants to merge 2 commits into
base: SYCLomatic
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions clang/lib/DPCT/RulesLangLib/CUB/RewriterClassMethods.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,18 @@ RewriterMap dpct::createClassMethodsRewriterMap() {
// cub::BlockStore.Store
HEADER_INSERT_FACTORY(
HeaderType::HT_DPCT_GROUP_Utils,
CONDITIONAL_FACTORY_ENTRY(
makeCheckAnd(CheckArgCount(2), CheckCUBEnumTemplateArg(3)),
MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
MemberExprBase(), false, "store",
NDITEM, ARG(0), ARG(1)),
UNSUPPORT_FACTORY_ENTRY("cub::BlockStore.Store",
Diagnostics::API_NOT_MIGRATED,
printCallExprPretty())))
CASE_FACTORY_ENTRY(
CASE(makeCheckAnd(CheckArgCount(2), CheckCUBEnumTemplateArg(3)),
MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
MemberExprBase(), false, "store",
NDITEM, ARG(0), ARG(1))),
CASE(makeCheckAnd(CheckArgCount(3), CheckCUBEnumTemplateArg(3)),
MEMBER_CALL_FACTORY_ENTRY("cub::BlockStore.Store",
MemberExprBase(), false, "store",
NDITEM, ARG(0), ARG(1), ARG(2))),
OTHERWISE(UNSUPPORT_FACTORY_ENTRY("cub::BlockStore.Store",
Diagnostics::API_NOT_MIGRATED,
printCallExprPretty()))))

};
}
90 changes: 90 additions & 0 deletions clang/runtime/dpct-rt/include/dpct/group_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,60 @@ __dpct_inline__ void store_direct_striped(const ItemT &item,
work_item_iter[i * work_group_size] = data[i];
}

/// Store a blocked arrangement of items across a work-group into a linear
/// segment of items, guarded by range.
///
/// \tparam T The data type to store.
/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
/// onto each work-item.
/// \tparam OutputIteratorT The random-access iterator type for output.
yihanwg marked this conversation as resolved.
Show resolved Hide resolved
/// \iterator.
/// \tparam ItemT The sycl::nd_item index space class.
/// \param item The calling work-item.
/// \param output_iter The work-group's base output iterator for writing.
/// \param data Data to store.
/// \param valid_items Number of valid items to load
yihanwg marked this conversation as resolved.
Show resolved Hide resolved
template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
typename ItemT>
__dpct_inline__ void
store_direct_blocked(const ItemT &item, OutputIteratorT output_iter,
T (&data)[ElementsPerWorkItem], size_t valid_items) {
size_t work_item_id = item.get_local_linear_id();
OutputIteratorT work_item_iter =
output_iter + (work_item_id * ElementsPerWorkItem);
#pragma unroll
for (size_t i = 0; i < ElementsPerWorkItem; i++)
if (i + (work_item_id * ElementsPerWorkItem) < valid_items)
work_item_iter[i] = data[i];
}

/// Store a striped arrangement of items across a work-group into a linear
/// segment of items, guarded by range.
///
/// \tparam T The data type to store.
/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
/// onto each work-item.
/// \tparam OutputIteratorT The random-access iterator type for output.
/// \iterator.
/// \tparam ItemT The sycl::nd_item index space class.
/// \param item The calling work-item.
/// \param output_iter The work-group's base output iterator for writing.
/// \param items Data to store.
/// \param valid_items Number of valid items to load
template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
typename ItemT>
__dpct_inline__ void
store_direct_striped(const ItemT &item, OutputIteratorT output_iter,
T (&data)[ElementsPerWorkItem], size_t valid_items) {
size_t work_group_size = item.get_group().get_local_linear_range();
size_t work_item_id = item.get_local_linear_id();
OutputIteratorT work_item_iter = output_iter + work_item_id;
#pragma unroll
for (size_t i = 0; i < ElementsPerWorkItem; i++)
if ((i * work_group_size) + work_item_id < valid_items)
work_item_iter[i * work_group_size] = data[i];
}

// loads a linear segment of workgroup items into a subgroup striped
// arrangement. Created as free function until exchange mechanism is
// implemented.
Expand Down Expand Up @@ -1020,6 +1074,42 @@ class group_store {
item, output_iter, data);
}
}

/// Store items into a linear segment of memory, guarded by range.
///
/// Suppose 512 integer data elements partitioned across 128 work-items, where
/// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and
/// \p valid_items is 5, the \p output across the work-group is:
///
/// {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}.
///
/// The blocked order \p output will be:
///
/// 0, 1, 2, 3, 4, 5, 0, 0, ..., 0, 0, 0, 0.
///
/// The striped order \p output will be:
///
/// 0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0.
///
/// \tparam ItemT The sycl::nd_item index space class.
/// \tparam OutputIteratorT The random-access iterator type for \p output
/// iterator.
/// \param item The work-item identifier.
/// \param input The input data of each work-item.
/// \param data The data to store.
/// \param valid_items Number of valid items to load
template <typename ItemT, typename OutputIteratorT>
__dpct_inline__ void store(const ItemT &item, OutputIteratorT output_iter,
T (&data)[ElementsPerWorkItem],
size_t valid_items) {
if constexpr (StoreAlgorithm == group_store_algorithm::blocked) {
store_direct_blocked<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
item, output_iter, data, valid_items);
} else if constexpr (StoreAlgorithm == group_store_algorithm::striped) {
store_direct_striped<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
item, output_iter, data, valid_items);
}
}
};
} // namespace group
} // namespace dpct
Expand Down
77 changes: 77 additions & 0 deletions clang/test/dpct/cub/blocklevel/blockstore.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
// UNSUPPORTED: system-windows
// RUN: dpct -in-root %S -out-root %T/blocklevel/blockstore %S/blockstore.cu --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
// RUN: FileCheck --input-file %T/blocklevel/blockstore/blockstore.dp.cpp --match-full-lines %s
// RUN: %if build_lit %{icpx -c -fsycl %T/blocklevel/blockstore/blockstore.dp.cpp -o %T/blocklevel/blockstore/blockstore.dp.o %}

#include <cub/cub.cuh>

__global__ void BlockedKernel(int *d_data, int valid_items) {
// Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
// CHECK: using BlockStore = dpct::group::group_store<int, 4>;
using BlockStore = cub::BlockStore<int, 128, 4>;

__shared__ typename BlockStore::TempStorage temp_storage;

int thread_data[4];
thread_data[0] = threadIdx.x * 4 + 0;
thread_data[1] = threadIdx.x * 4 + 1;
thread_data[2] = threadIdx.x * 4 + 2;
thread_data[3] = threadIdx.x * 4 + 3;

// CHECK: BlockStore(temp_storage).store(item_ct1, d_data, thread_data, valid_items);
BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
}

__global__ void StripedKernel(int *d_data, int valid_items) {
// Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
// CHECK: using BlockStore = dpct::group::group_store<int, 4, dpct::group::group_store_algorithm::striped>;
using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_STRIPED>;

__shared__ typename BlockStore::TempStorage temp_storage;

int thread_data[4];
thread_data[0] = threadIdx.x * 4 + 0;
thread_data[1] = threadIdx.x * 4 + 1;
thread_data[2] = threadIdx.x * 4 + 2;
thread_data[3] = threadIdx.x * 4 + 3;
// CHECK: BlockStore(temp_storage).store(item_ct1, d_data, thread_data, valid_items);
BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
}

int main() {
int *d_data;
cudaMallocManaged(&d_data, sizeof(int) * 512);
cudaMemset(d_data, 0, sizeof(int) * 512);
// CHECK: q_ct1.submit(
// CHECK-NEXT: [&](sycl::handler &cgh) {
// CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
// CHECK-EMPTY:
// CHECK-NEXT: cgh.parallel_for(
// CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
// CHECK-NEXT: BlockedKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
// CHECK-NEXT: });
// CHECK-NEXT: });
BlockedKernel<<<1, 128>>>(d_data, 5);
cudaStreamSynchronize(0);
for (int i = 0; i < 512; ++i)
printf("%d%c", d_data[i], (i == 511 ? '\n' : ' '));
cudaMemset(d_data, 0, sizeof(int) * 512);
// CHECK: q_ct1.submit(
// CHECK-NEXT: [&](sycl::handler &cgh) {
// CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
// CHECK-EMPTY:
// CHECK-NEXT: cgh.parallel_for(
// CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
// CHECK-NEXT: StripedKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
// CHECK-NEXT: });
// CHECK-NEXT: });
StripedKernel<<<1, 128>>>(d_data, 5);
cudaStreamSynchronize(0);
for (int i = 0; i < 512; ++i)
printf("%d%c", d_data[i], (i == 511 ? '\n' : ' '));
return 0;
}