forked from intel/llvm
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SYCL] Naive implementation for group_load_store extension (intel#13043)
- Loading branch information
1 parent
10f12be
commit 144c2bc
Showing
10 changed files
with
745 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
190 changes: 190 additions & 0 deletions
190
sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
//==---- group_load_store.hpp --- SYCL extension for group loads/stores ----==// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
// Implements sycl_ext_oneapi_group_load_store extension. | ||
|
||
#pragma once | ||
|
||
#include <sycl/ext/oneapi/properties/properties.hpp> | ||
|
||
namespace sycl { | ||
inline namespace _V1 { | ||
namespace ext::oneapi::experimental { | ||
|
||
enum class data_placement_enum { blocked, striped }; | ||
|
||
struct data_placement_key | ||
: detail::compile_time_property_key<detail::PropKind::DataPlacement> { | ||
template <data_placement_enum Placement> | ||
using value_t = | ||
property_value<data_placement_key, | ||
// TODO: Extension uses data_placement_enum directly here. | ||
std::integral_constant<int, static_cast<int>(Placement)>>; | ||
}; | ||
|
||
template <data_placement_enum Placement> | ||
inline constexpr data_placement_key::value_t<Placement> data_placement; | ||
|
||
inline constexpr data_placement_key::value_t<data_placement_enum::blocked> | ||
data_placement_blocked; | ||
inline constexpr data_placement_key::value_t<data_placement_enum::striped> | ||
data_placement_striped; | ||
|
||
struct contiguous_memory_key | ||
: detail::compile_time_property_key<detail::PropKind::ContiguousMemory> { | ||
using value_t = property_value<contiguous_memory_key>; | ||
}; | ||
|
||
inline constexpr contiguous_memory_key::value_t contiguous_memory; | ||
|
||
struct full_group_key | ||
: detail::compile_time_property_key<detail::PropKind::FullGroup> { | ||
using value_t = property_value<full_group_key>; | ||
}; | ||
|
||
inline constexpr full_group_key::value_t full_group; | ||
|
||
namespace detail { | ||
using namespace sycl::detail; | ||
|
||
template <typename InputIteratorT, typename OutputElemT> | ||
inline constexpr bool verify_load_types = | ||
std::is_same_v< | ||
typename std::iterator_traits<InputIteratorT>::iterator_category, | ||
std::random_access_iterator_tag> && | ||
std::is_convertible_v<remove_decoration_t<typename std::iterator_traits< | ||
InputIteratorT>::value_type>, | ||
OutputElemT> && | ||
std::is_trivially_copyable_v<remove_decoration_t< | ||
typename std::iterator_traits<InputIteratorT>::value_type>> && | ||
std::is_default_constructible_v<remove_decoration_t< | ||
typename std::iterator_traits<InputIteratorT>::value_type>> && | ||
std::is_trivially_copyable_v<OutputElemT> && | ||
std::is_default_constructible_v<OutputElemT>; | ||
|
||
template <typename InputElemT, typename OutputIteratorT> | ||
inline constexpr bool verify_store_types = | ||
std::is_same_v< | ||
typename std::iterator_traits<OutputIteratorT>::iterator_category, | ||
std::random_access_iterator_tag> && | ||
std::is_convertible_v<InputElemT, | ||
remove_decoration_t<typename std::iterator_traits< | ||
OutputIteratorT>::value_type>> && | ||
std::is_trivially_copyable_v<remove_decoration_t< | ||
typename std::iterator_traits<OutputIteratorT>::value_type>> && | ||
std::is_default_constructible_v<remove_decoration_t< | ||
typename std::iterator_traits<OutputIteratorT>::value_type>> && | ||
std::is_trivially_copyable_v<InputElemT> && | ||
std::is_default_constructible_v<InputElemT>; | ||
|
||
template <typename Properties> constexpr bool isBlocked(Properties properties) { | ||
if constexpr (properties.template has_property<data_placement_key>()) | ||
return properties.template get_property<data_placement_key>() == | ||
data_placement_blocked; | ||
else | ||
return true; | ||
} | ||
|
||
template <bool IsBlocked, int VEC_OR_ARRAY_SIZE, typename GroupTy> | ||
int get_mem_idx(GroupTy g, int vec_or_array_idx) { | ||
if constexpr (IsBlocked) | ||
return g.get_local_linear_id() * VEC_OR_ARRAY_SIZE + vec_or_array_idx; | ||
else | ||
return g.get_local_linear_id() + | ||
g.get_local_linear_range() * vec_or_array_idx; | ||
} | ||
} // namespace detail | ||
|
||
#ifdef __SYCL_DEVICE_ONLY__ | ||
// Load API span overload. | ||
template <typename Group, typename InputIteratorT, typename OutputT, | ||
std::size_t ElementsPerWorkItem, | ||
typename Properties = decltype(properties())> | ||
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> && | ||
detail::is_generic_group_v<Group>> | ||
group_load(Group g, InputIteratorT in_ptr, | ||
span<OutputT, ElementsPerWorkItem> out, Properties properties = {}) { | ||
constexpr bool blocked = detail::isBlocked(properties); | ||
|
||
group_barrier(g); | ||
for (int i = 0; i < out.size(); ++i) | ||
out[i] = in_ptr[detail::get_mem_idx<blocked, ElementsPerWorkItem>(g, i)]; | ||
group_barrier(g); | ||
} | ||
|
||
// Store API span overload. | ||
template <typename Group, typename InputT, std::size_t ElementsPerWorkItem, | ||
typename OutputIteratorT, | ||
typename Properties = decltype(properties())> | ||
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> && | ||
detail::is_generic_group_v<Group>> | ||
group_store(Group g, const span<InputT, ElementsPerWorkItem> in, | ||
OutputIteratorT out_ptr, Properties properties = {}) { | ||
constexpr bool blocked = detail::isBlocked(properties); | ||
|
||
group_barrier(g); | ||
for (int i = 0; i < in.size(); ++i) | ||
out_ptr[detail::get_mem_idx<blocked, ElementsPerWorkItem>(g, i)] = in[i]; | ||
group_barrier(g); | ||
} | ||
|
||
// Load API scalar. | ||
template <typename Group, typename InputIteratorT, typename OutputT, | ||
typename Properties = decltype(properties())> | ||
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> && | ||
detail::is_generic_group_v<Group>> | ||
group_load(Group g, InputIteratorT in_ptr, OutputT &out, | ||
Properties properties = {}) { | ||
group_load(g, in_ptr, span<OutputT, 1>(&out, 1), properties); | ||
} | ||
|
||
// Store API scalar. | ||
template <typename Group, typename InputT, typename OutputIteratorT, | ||
typename Properties = decltype(properties())> | ||
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> && | ||
detail::is_generic_group_v<Group>> | ||
group_store(Group g, const InputT &in, OutputIteratorT out_ptr, | ||
Properties properties = {}) { | ||
group_store(g, span<const InputT, 1>(&in, 1), out_ptr, properties); | ||
} | ||
|
||
// Load API sycl::vec overload. | ||
template <typename Group, typename InputIteratorT, typename OutputT, int N, | ||
typename Properties = decltype(properties())> | ||
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> && | ||
detail::is_generic_group_v<Group>> | ||
group_load(Group g, InputIteratorT in_ptr, sycl::vec<OutputT, N> &out, | ||
Properties properties = {}) { | ||
group_load(g, in_ptr, span<OutputT, N>(&out[0], N), properties); | ||
} | ||
|
||
// Store API sycl::vec overload. | ||
template <typename Group, typename InputT, int N, typename OutputIteratorT, | ||
typename Properties = decltype(properties())> | ||
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> && | ||
detail::is_generic_group_v<Group>> | ||
group_store(Group g, const sycl::vec<InputT, N> &in, OutputIteratorT out_ptr, | ||
Properties properties = {}) { | ||
group_store(g, span<const InputT, N>(&in[0], N), out_ptr, properties); | ||
} | ||
|
||
#else | ||
template <typename... Args> void group_load(Args...) { | ||
throw sycl::exception( | ||
std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), | ||
"Group loads/stores are not supported on host."); | ||
} | ||
template <typename... Args> void group_store(Args...) { | ||
throw sycl::exception( | ||
std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), | ||
"Group loads/stores are not supported on host."); | ||
} | ||
#endif | ||
} // namespace ext::oneapi::experimental | ||
} // namespace _V1 | ||
} // namespace sycl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
// RUN: %{build} -o %t.out | ||
// RUN: %{run} %t.out | ||
|
||
#include <sycl/sycl.hpp> | ||
|
||
int main() { | ||
using namespace sycl; | ||
namespace sycl_exp = sycl::ext::oneapi::experimental; | ||
|
||
constexpr std::size_t wg_size = 32; | ||
constexpr std::size_t n_wgs = 2; | ||
constexpr std::size_t global_size = n_wgs * wg_size; | ||
constexpr std::size_t elems_per_wi = 4; | ||
constexpr std::size_t n = global_size * elems_per_wi; | ||
|
||
queue q; | ||
|
||
buffer<int, 1> input_buf{n}; | ||
|
||
{ | ||
host_accessor acc{input_buf}; | ||
std::iota(acc.begin(), acc.end(), 0); | ||
} | ||
|
||
buffer<int, 1> load_blocked_default_buf{n}; | ||
buffer<int, 1> load_blocked_buf{n}; | ||
buffer<int, 1> load_striped_buf{n}; | ||
buffer<int, 1> store_blocked_default_buf{n}; | ||
buffer<int, 1> store_blocked_buf{n}; | ||
buffer<int, 1> store_striped_buf{n}; | ||
|
||
q.submit([&](handler &cgh) { | ||
accessor input{input_buf, cgh}; | ||
|
||
accessor load_blocked_default{load_blocked_default_buf, cgh}; | ||
accessor load_blocked{load_blocked_buf, cgh}; | ||
accessor load_striped{load_striped_buf, cgh}; | ||
accessor store_blocked_default{store_blocked_default_buf, cgh}; | ||
accessor store_blocked{store_blocked_buf, cgh}; | ||
accessor store_striped{store_striped_buf, cgh}; | ||
|
||
cgh.parallel_for(nd_range<1>{global_size, wg_size}, [=](nd_item<1> ndi) { | ||
auto gid = ndi.get_global_id(0); | ||
auto g = ndi.get_group(); | ||
auto offset = g.get_group_id(0) * g.get_local_range(0) * elems_per_wi; | ||
|
||
int data[elems_per_wi]; | ||
|
||
auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked}; | ||
auto striped = sycl_exp::properties{sycl_exp::data_placement_striped}; | ||
|
||
// default | ||
sycl_exp::group_load(g, input.begin() + offset, span{data}); | ||
for (int i = 0; i < elems_per_wi; ++i) | ||
load_blocked_default[gid * elems_per_wi + i] = data[i]; | ||
|
||
// blocked | ||
sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked); | ||
for (int i = 0; i < elems_per_wi; ++i) | ||
load_blocked[gid * elems_per_wi + i] = data[i]; | ||
|
||
// striped | ||
sycl_exp::group_load(g, input.begin() + offset, span{data}, striped); | ||
for (int i = 0; i < elems_per_wi; ++i) | ||
load_striped[gid * elems_per_wi + i] = data[i]; | ||
|
||
// Stores: | ||
|
||
std::iota(std::begin(data), std::end(data), gid * elems_per_wi); | ||
|
||
sycl_exp::group_store(g, span{data}, | ||
store_blocked_default.begin() + offset); | ||
sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset, | ||
blocked); | ||
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset, | ||
striped); | ||
}); | ||
}); | ||
|
||
host_accessor load_blocked_default{load_blocked_default_buf}; | ||
host_accessor load_blocked{load_blocked_buf}; | ||
host_accessor load_striped{load_striped_buf}; | ||
host_accessor store_blocked_default{store_blocked_default_buf}; | ||
host_accessor store_blocked{store_blocked_buf}; | ||
host_accessor store_striped{store_striped_buf}; | ||
|
||
// Check blocked. | ||
for (int i = 0; i < global_size * elems_per_wi; ++i) { | ||
assert(load_blocked_default[i] == i); | ||
assert(load_blocked[i] == i); | ||
assert(store_blocked_default[i] == i); | ||
assert(store_blocked[i] == i); | ||
} | ||
|
||
// Check striped. | ||
for (int wi = 0; wi < global_size; ++wi) { | ||
auto group = wi / wg_size; | ||
auto lid = wi % wg_size; | ||
|
||
for (auto elem = 0; elem < elems_per_wi; ++elem) { | ||
auto striped_idx = group * wg_size * elems_per_wi + elem * wg_size + lid; | ||
assert(load_striped[wi * elems_per_wi + elem] == striped_idx); | ||
|
||
auto value_stored = wi * elems_per_wi + elem; | ||
assert(store_striped[striped_idx] == value_stored); | ||
} | ||
} | ||
|
||
return 0; | ||
} |
49 changes: 49 additions & 0 deletions
49
sycl/test-e2e/GroupAlgorithm/load_store/conversions_load.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
// RUN: %{build} -o %t.out | ||
// RUN: %{run} %t.out | ||
|
||
#include <sycl/sycl.hpp> | ||
|
||
struct S { | ||
S() : i(-1) {} | ||
S(int i) : i(i + 42) {} | ||
|
||
int i; | ||
}; | ||
|
||
int main() { | ||
using namespace sycl; | ||
namespace sycl_exp = sycl::ext::oneapi::experimental; | ||
|
||
constexpr std::size_t wg_size = 16; | ||
|
||
queue q; | ||
|
||
buffer<int, 1> input_buf{wg_size * 2}; | ||
{ | ||
host_accessor acc{input_buf}; | ||
std::iota(acc.begin(), acc.end(), 0); | ||
} | ||
buffer<bool, 1> success_buf{wg_size}; | ||
|
||
q.submit([&](handler &cgh) { | ||
accessor input{input_buf, cgh}; | ||
accessor success{success_buf, cgh}; | ||
cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) { | ||
auto gid = ndi.get_global_id(0); | ||
auto g = ndi.get_group(); | ||
|
||
S data[2]; | ||
sycl_exp::group_load(g, input.begin(), span{data}); | ||
|
||
bool ok = true; | ||
ok &= (data[0].i == gid * 2 + 0 + 42); | ||
ok &= (data[1].i == gid * 2 + 1 + 42); | ||
success[gid] = ok; | ||
}); | ||
}); | ||
|
||
for (bool wi_success : host_accessor{success_buf}) | ||
assert(wi_success); | ||
|
||
return 0; | ||
} |
Oops, something went wrong.