Skip to content

Commit

Permalink
[SYCL] Naive implementation for group_load_store extension (intel#13043)
Browse files Browse the repository at this point in the history
  • Loading branch information
aelovikov-intel authored Apr 19, 2024
1 parent 10f12be commit 144c2bc
Show file tree
Hide file tree
Showing 10 changed files with 745 additions and 1 deletion.
2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ template <typename T>
struct is_generic_group
: std::integral_constant<bool,
is_group<T>::value || is_sub_group<T>::value> {};
template <typename T>
inline constexpr bool is_generic_group_v = is_generic_group<T>::value;

namespace half_impl {
class half;
Expand Down
190 changes: 190 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
//==---- group_load_store.hpp --- SYCL extension for group loads/stores ----==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Implements sycl_ext_oneapi_group_load_store extension.

#pragma once

#include <sycl/ext/oneapi/properties/properties.hpp>

namespace sycl {
inline namespace _V1 {
namespace ext::oneapi::experimental {

enum class data_placement_enum { blocked, striped };

struct data_placement_key
: detail::compile_time_property_key<detail::PropKind::DataPlacement> {
template <data_placement_enum Placement>
using value_t =
property_value<data_placement_key,
// TODO: Extension uses data_placement_enum directly here.
std::integral_constant<int, static_cast<int>(Placement)>>;
};

template <data_placement_enum Placement>
inline constexpr data_placement_key::value_t<Placement> data_placement;

inline constexpr data_placement_key::value_t<data_placement_enum::blocked>
data_placement_blocked;
inline constexpr data_placement_key::value_t<data_placement_enum::striped>
data_placement_striped;

struct contiguous_memory_key
: detail::compile_time_property_key<detail::PropKind::ContiguousMemory> {
using value_t = property_value<contiguous_memory_key>;
};

inline constexpr contiguous_memory_key::value_t contiguous_memory;

struct full_group_key
: detail::compile_time_property_key<detail::PropKind::FullGroup> {
using value_t = property_value<full_group_key>;
};

inline constexpr full_group_key::value_t full_group;

namespace detail {
using namespace sycl::detail;

template <typename InputIteratorT, typename OutputElemT>
inline constexpr bool verify_load_types =
std::is_same_v<
typename std::iterator_traits<InputIteratorT>::iterator_category,
std::random_access_iterator_tag> &&
std::is_convertible_v<remove_decoration_t<typename std::iterator_traits<
InputIteratorT>::value_type>,
OutputElemT> &&
std::is_trivially_copyable_v<remove_decoration_t<
typename std::iterator_traits<InputIteratorT>::value_type>> &&
std::is_default_constructible_v<remove_decoration_t<
typename std::iterator_traits<InputIteratorT>::value_type>> &&
std::is_trivially_copyable_v<OutputElemT> &&
std::is_default_constructible_v<OutputElemT>;

template <typename InputElemT, typename OutputIteratorT>
inline constexpr bool verify_store_types =
std::is_same_v<
typename std::iterator_traits<OutputIteratorT>::iterator_category,
std::random_access_iterator_tag> &&
std::is_convertible_v<InputElemT,
remove_decoration_t<typename std::iterator_traits<
OutputIteratorT>::value_type>> &&
std::is_trivially_copyable_v<remove_decoration_t<
typename std::iterator_traits<OutputIteratorT>::value_type>> &&
std::is_default_constructible_v<remove_decoration_t<
typename std::iterator_traits<OutputIteratorT>::value_type>> &&
std::is_trivially_copyable_v<InputElemT> &&
std::is_default_constructible_v<InputElemT>;

template <typename Properties> constexpr bool isBlocked(Properties properties) {
if constexpr (properties.template has_property<data_placement_key>())
return properties.template get_property<data_placement_key>() ==
data_placement_blocked;
else
return true;
}

template <bool IsBlocked, int VEC_OR_ARRAY_SIZE, typename GroupTy>
int get_mem_idx(GroupTy g, int vec_or_array_idx) {
if constexpr (IsBlocked)
return g.get_local_linear_id() * VEC_OR_ARRAY_SIZE + vec_or_array_idx;
else
return g.get_local_linear_id() +
g.get_local_linear_range() * vec_or_array_idx;
}
} // namespace detail

#ifdef __SYCL_DEVICE_ONLY__
// Load API span overload.
template <typename Group, typename InputIteratorT, typename OutputT,
std::size_t ElementsPerWorkItem,
typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> &&
detail::is_generic_group_v<Group>>
group_load(Group g, InputIteratorT in_ptr,
span<OutputT, ElementsPerWorkItem> out, Properties properties = {}) {
constexpr bool blocked = detail::isBlocked(properties);

group_barrier(g);
for (int i = 0; i < out.size(); ++i)
out[i] = in_ptr[detail::get_mem_idx<blocked, ElementsPerWorkItem>(g, i)];
group_barrier(g);
}

// Store API span overload.
template <typename Group, typename InputT, std::size_t ElementsPerWorkItem,
typename OutputIteratorT,
typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> &&
detail::is_generic_group_v<Group>>
group_store(Group g, const span<InputT, ElementsPerWorkItem> in,
OutputIteratorT out_ptr, Properties properties = {}) {
constexpr bool blocked = detail::isBlocked(properties);

group_barrier(g);
for (int i = 0; i < in.size(); ++i)
out_ptr[detail::get_mem_idx<blocked, ElementsPerWorkItem>(g, i)] = in[i];
group_barrier(g);
}

// Load API scalar.
template <typename Group, typename InputIteratorT, typename OutputT,
typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> &&
detail::is_generic_group_v<Group>>
group_load(Group g, InputIteratorT in_ptr, OutputT &out,
Properties properties = {}) {
group_load(g, in_ptr, span<OutputT, 1>(&out, 1), properties);
}

// Store API scalar.
template <typename Group, typename InputT, typename OutputIteratorT,
typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> &&
detail::is_generic_group_v<Group>>
group_store(Group g, const InputT &in, OutputIteratorT out_ptr,
Properties properties = {}) {
group_store(g, span<const InputT, 1>(&in, 1), out_ptr, properties);
}

// Load API sycl::vec overload.
template <typename Group, typename InputIteratorT, typename OutputT, int N,
typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> &&
detail::is_generic_group_v<Group>>
group_load(Group g, InputIteratorT in_ptr, sycl::vec<OutputT, N> &out,
Properties properties = {}) {
group_load(g, in_ptr, span<OutputT, N>(&out[0], N), properties);
}

// Store API sycl::vec overload.
template <typename Group, typename InputT, int N, typename OutputIteratorT,
typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> &&
detail::is_generic_group_v<Group>>
group_store(Group g, const sycl::vec<InputT, N> &in, OutputIteratorT out_ptr,
Properties properties = {}) {
group_store(g, span<const InputT, N>(&in[0], N), out_ptr, properties);
}

#else
template <typename... Args> void group_load(Args...) {
throw sycl::exception(
std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
"Group loads/stores are not supported on host.");
}
template <typename... Args> void group_store(Args...) {
throw sycl::exception(
std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
"Group loads/stores are not supported on host.");
}
#endif
} // namespace ext::oneapi::experimental
} // namespace _V1
} // namespace sycl
5 changes: 4 additions & 1 deletion sycl/include/sycl/ext/oneapi/properties/property.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,11 @@ enum PropKind : uint32_t {
Balanced = 55,
InvocationCapacity = 56,
ResponseCapacity = 57,
DataPlacement = 58,
ContiguousMemory = 59,
FullGroup = 60,
// PropKindSize must always be the last value.
PropKindSize = 58,
PropKindSize = 61,
};

struct property_key_base_tag {};
Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/sycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
#include <sycl/ext/oneapi/experimental/composite_device.hpp>
#include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
#include <sycl/ext/oneapi/experimental/fixed_size_group.hpp>
#include <sycl/ext/oneapi/experimental/group_load_store.hpp>
#include <sycl/ext/oneapi/experimental/opportunistic_group.hpp>
#include <sycl/ext/oneapi/experimental/prefetch.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
Expand Down
110 changes: 110 additions & 0 deletions sycl/test-e2e/GroupAlgorithm/load_store/basic.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include <sycl/sycl.hpp>

int main() {
using namespace sycl;
namespace sycl_exp = sycl::ext::oneapi::experimental;

constexpr std::size_t wg_size = 32;
constexpr std::size_t n_wgs = 2;
constexpr std::size_t global_size = n_wgs * wg_size;
constexpr std::size_t elems_per_wi = 4;
constexpr std::size_t n = global_size * elems_per_wi;

queue q;

buffer<int, 1> input_buf{n};

{
host_accessor acc{input_buf};
std::iota(acc.begin(), acc.end(), 0);
}

buffer<int, 1> load_blocked_default_buf{n};
buffer<int, 1> load_blocked_buf{n};
buffer<int, 1> load_striped_buf{n};
buffer<int, 1> store_blocked_default_buf{n};
buffer<int, 1> store_blocked_buf{n};
buffer<int, 1> store_striped_buf{n};

q.submit([&](handler &cgh) {
accessor input{input_buf, cgh};

accessor load_blocked_default{load_blocked_default_buf, cgh};
accessor load_blocked{load_blocked_buf, cgh};
accessor load_striped{load_striped_buf, cgh};
accessor store_blocked_default{store_blocked_default_buf, cgh};
accessor store_blocked{store_blocked_buf, cgh};
accessor store_striped{store_striped_buf, cgh};

cgh.parallel_for(nd_range<1>{global_size, wg_size}, [=](nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
auto g = ndi.get_group();
auto offset = g.get_group_id(0) * g.get_local_range(0) * elems_per_wi;

int data[elems_per_wi];

auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked};
auto striped = sycl_exp::properties{sycl_exp::data_placement_striped};

// default
sycl_exp::group_load(g, input.begin() + offset, span{data});
for (int i = 0; i < elems_per_wi; ++i)
load_blocked_default[gid * elems_per_wi + i] = data[i];

// blocked
sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked);
for (int i = 0; i < elems_per_wi; ++i)
load_blocked[gid * elems_per_wi + i] = data[i];

// striped
sycl_exp::group_load(g, input.begin() + offset, span{data}, striped);
for (int i = 0; i < elems_per_wi; ++i)
load_striped[gid * elems_per_wi + i] = data[i];

// Stores:

std::iota(std::begin(data), std::end(data), gid * elems_per_wi);

sycl_exp::group_store(g, span{data},
store_blocked_default.begin() + offset);
sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset,
blocked);
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset,
striped);
});
});

host_accessor load_blocked_default{load_blocked_default_buf};
host_accessor load_blocked{load_blocked_buf};
host_accessor load_striped{load_striped_buf};
host_accessor store_blocked_default{store_blocked_default_buf};
host_accessor store_blocked{store_blocked_buf};
host_accessor store_striped{store_striped_buf};

// Check blocked.
for (int i = 0; i < global_size * elems_per_wi; ++i) {
assert(load_blocked_default[i] == i);
assert(load_blocked[i] == i);
assert(store_blocked_default[i] == i);
assert(store_blocked[i] == i);
}

// Check striped.
for (int wi = 0; wi < global_size; ++wi) {
auto group = wi / wg_size;
auto lid = wi % wg_size;

for (auto elem = 0; elem < elems_per_wi; ++elem) {
auto striped_idx = group * wg_size * elems_per_wi + elem * wg_size + lid;
assert(load_striped[wi * elems_per_wi + elem] == striped_idx);

auto value_stored = wi * elems_per_wi + elem;
assert(store_striped[striped_idx] == value_stored);
}
}

return 0;
}
49 changes: 49 additions & 0 deletions sycl/test-e2e/GroupAlgorithm/load_store/conversions_load.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include <sycl/sycl.hpp>

struct S {
S() : i(-1) {}
S(int i) : i(i + 42) {}

int i;
};

int main() {
using namespace sycl;
namespace sycl_exp = sycl::ext::oneapi::experimental;

constexpr std::size_t wg_size = 16;

queue q;

buffer<int, 1> input_buf{wg_size * 2};
{
host_accessor acc{input_buf};
std::iota(acc.begin(), acc.end(), 0);
}
buffer<bool, 1> success_buf{wg_size};

q.submit([&](handler &cgh) {
accessor input{input_buf, cgh};
accessor success{success_buf, cgh};
cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
auto g = ndi.get_group();

S data[2];
sycl_exp::group_load(g, input.begin(), span{data});

bool ok = true;
ok &= (data[0].i == gid * 2 + 0 + 42);
ok &= (data[1].i == gid * 2 + 1 + 42);
success[gid] = ok;
});
});

for (bool wi_success : host_accessor{success_buf})
assert(wi_success);

return 0;
}
Loading

0 comments on commit 144c2bc

Please sign in to comment.