Skip to content

Commit

Permalink
Remove unneeded headers and unused variables; other cleanup (#1784)
Browse files (browse the repository at this point in the history)
Co-authored-by: Dmitriy Sobolev <[email protected]>
  • Loading branch information
akukanov and dmitriy-sobolev authored Oct 3, 2024
1 parent 43aa2ae commit 29cd230
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 48 deletions.
16 changes: 8 additions & 8 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -1691,14 +1691,14 @@ __pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _F
_ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2,
_OutputIterator __result, _Compare __comp, _IsOpDifference)
{
typedef typename ::std::iterator_traits<_ForwardIterator1>::difference_type _Size1;
typedef typename ::std::iterator_traits<_ForwardIterator2>::difference_type _Size2;
typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _Size1;
typedef typename std::iterator_traits<_ForwardIterator2>::difference_type _Size2;

const _Size1 __n1 = __last1 - __first1;
const _Size2 __n2 = __last2 - __first2;

//Algo is based on the recommended approach of set_intersection algo for GPU: binary search + scan (copying by mask).
using _ReduceOp = ::std::plus<_Size1>;
using _ReduceOp = std::plus<_Size1>;
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _MaskAssigner = unseq_backend::__mask_assigner<2>;
Expand All @@ -1708,7 +1708,7 @@ __pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _F
_ReduceOp __reduce_op;
_Assigner __assign_op;
_DataAcc __get_data_op;
unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ ::std::true_type, 2>
unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ std::true_type, 2>
__copy_by_mask_op;
unseq_backend::__brick_set_op<_ExecutionPolicy, _Compare, _Size1, _Size2, _IsOpDifference> __create_mask_op{
__comp, __n1, __n2};
Expand All @@ -1728,18 +1728,18 @@ __pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _F

auto __result_size =
__par_backend_hetero::__parallel_transform_scan_base(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
__buf1.all_view(), __buf2.all_view(),
oneapi::dpl::__ranges::all_view<int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
__buf3.all_view(), __reduce_op, _InitType{},
__buf3.all_view(), _InitType{},
// local scan
unseq_backend::__scan</*inclusive*/ ::std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
unseq_backend::__scan</*inclusive*/ std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
_MaskAssigner, decltype(__create_mask_op), _InitType>{
__reduce_op, __get_data_op, __assign_op, _MaskAssigner{}, __create_mask_op},
// scan between groups
unseq_backend::__scan</*inclusive=*/::std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign,
unseq_backend::__scan</*inclusive=*/std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign,
_Assigner, _DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op,
__get_data_op},
// global scan
Expand Down
26 changes: 13 additions & 13 deletions include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -932,13 +932,13 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&

oneapi::dpl::__internal::__ranges::__pattern_walk_n(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_wrapper>(__exec), __copy_range,
::std::forward<_Range1>(__keys), ::std::forward<_Range3>(__out_keys));
std::forward<_Range1>(__keys), std::forward<_Range3>(__out_keys));

oneapi::dpl::__internal::__ranges::__pattern_walk_n(
__tag,
oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_values_wrapper>(
::std::forward<_ExecutionPolicy>(__exec)),
__copy_range, ::std::forward<_Range2>(__values), ::std::forward<_Range4>(__out_values));
std::forward<_ExecutionPolicy>(__exec)),
__copy_range, std::forward<_Range2>(__values), std::forward<_Range4>(__out_values));

return 1;
}
Expand Down Expand Up @@ -968,7 +968,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
experimental::ranges::views::all_write(__idx));

// use work group size adjusted to shared local memory as the maximum segment size.
::std::size_t __wgroup_size =
std::size_t __wgroup_size =
oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type));

// element is copied if it is the 0th element (marks beginning of first segment), is in an index
Expand All @@ -977,14 +977,14 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
// TODO: replace wgroup size with segment size based on platform specifics.
auto __intermediate_result_end = __ranges::__pattern_copy_if(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2,
[__n, __binary_pred, __wgroup_size](const auto& __a) {
[__binary_pred, __wgroup_size](const auto& __a) {
// The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys
// for (i-1), but we still need to get its key value as it is the start of a segment
const auto index = ::std::get<0>(__a);
const auto index = std::get<0>(__a);
if (index == 0)
return true;
return index % __wgroup_size == 0 // segment size
|| !__binary_pred(::std::get<1>(__a), ::std::get<2>(__a)); // key comparison
return index % __wgroup_size == 0 // segment size
|| !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison
},
unseq_backend::__brick_assign_key_position{});

Expand All @@ -994,7 +994,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end,
oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx),
__intermediate_result_end),
::std::forward<_Range2>(__values), experimental::ranges::views::all_write(__tmp_out_values))
std::forward<_Range2>(__values), experimental::ranges::views::all_write(__tmp_out_values))
.wait();

// Round 2: final reduction to get result for each segment of equal adjacent keys
Expand Down Expand Up @@ -1023,22 +1023,22 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
[__binary_pred](const auto& __a) {
// The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys
// for (i-1), but we still need to get its key value as it is the start of a segment
if (::std::get<0>(__a) == 0)
if (std::get<0>(__a) == 0)
return true;
return !__binary_pred(::std::get<1>(__a), ::std::get<2>(__a)); // keys comparison
return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison
},
unseq_backend::__brick_assign_key_position{});

//reduce by segment
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{},
oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>(
::std::forward<_ExecutionPolicy>(__exec)),
std::forward<_ExecutionPolicy>(__exec)),
unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>(
__binary_op, __intermediate_result_end),
__result_end,
oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), __result_end),
experimental::ranges::views::all_read(__tmp_out_values), ::std::forward<_Range4>(__out_values))
experimental::ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values))
.__deferrable_wait();

return __result_end;
Expand Down
44 changes: 21 additions & 23 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -273,11 +273,11 @@ struct __parallel_scan_submitter;
template <typename _CustomName, typename... _PropagateScanName>
struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name<_PropagateScanName...>>
{
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinaryOperation,
typename _InitType, typename _LocalScan, typename _GroupScan, typename _GlobalScan>
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType,
typename _LocalScan, typename _GroupScan, typename _GlobalScan>
auto
operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _BinaryOperation __binary_op,
_InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) const
operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _InitType __init,
_LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) const
{
using _Type = typename _InitType::__value_type;
using _LocalScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<
Expand Down Expand Up @@ -738,11 +738,11 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
}
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinaryOperation, typename _InitType,
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType,
typename _LocalScan, typename _GroupScan, typename _GlobalScan>
auto
__parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __in_rng, _Range2&& __out_rng, _BinaryOperation __binary_op, _InitType __init,
_Range1&& __in_rng, _Range2&& __out_rng, _InitType __init,
_LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan)
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
Expand All @@ -751,8 +751,8 @@ __parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _E
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__scan_propagate_kernel<_CustomName>>;

return __parallel_scan_submitter<_CustomName, _PropagateKernel>()(
::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), ::std::forward<_Range2>(__out_rng),
__binary_op, __init, __local_scan, __group_scan, __global_scan);
std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng),
__init, __local_scan, __group_scan, __global_scan);
}

template <typename _Type>
Expand All @@ -761,8 +761,7 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_
std::size_t __single_group_upper_limit)
{
// Pessimistically only use half of the memory to take into account memory used by compiled kernel
const ::std::size_t __max_slm_size =
__queue.get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
const std::size_t __max_slm_size = __queue.get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
const auto __req_slm_size = sizeof(_Type) * __n_uniform;

return (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size);
Expand Down Expand Up @@ -906,7 +905,7 @@ template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typenam
typename _BinaryOperation, typename _Inclusive>
auto
__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __in_rng, _Range2&& __out_rng, ::std::size_t __n, _UnaryOperation __unary_op,
_Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op,
_InitType __init, _BinaryOperation __binary_op, _Inclusive)
{
using _Type = typename _InitType::__value_type;
Expand Down Expand Up @@ -961,7 +960,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen

return __parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __binary_op, __init,
std::forward<_Range2>(__out_rng), __init,
// local scan
unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner,
_NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op,
Expand Down Expand Up @@ -1047,7 +1046,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op,
_CopyByMaskOp __copy_by_mask_op)
{
using _ReduceOp = ::std::plus<_Size>;
using _ReduceOp = std::plus<_Size>;
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _MaskAssigner = unseq_backend::__mask_assigner<1>;
Expand All @@ -1063,19 +1062,18 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n);

return __parallel_transform_scan_base(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec),
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::zip_view(
__in_rng, oneapi::dpl::__ranges::all_view<int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
::std::forward<_OutRng>(__out_rng), __reduce_op, _InitType{},
std::forward<_OutRng>(__out_rng), _InitType{},
// local scan
unseq_backend::__scan</*inclusive*/ ::std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
unseq_backend::__scan</*inclusive*/ std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
_MaskAssigner, _CreateMaskOp, _InitType>{__reduce_op, __get_data_op, __assign_op,
__add_mask_op, __create_mask_op},
// scan between groups
unseq_backend::__scan</*inclusive*/ ::std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign,
_Assigner, _DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op,
__get_data_op},
unseq_backend::__scan</*inclusive*/ std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign, _Assigner,
_DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, __get_data_op},
// global scan
__copy_by_mask_op);
}
Expand Down Expand Up @@ -1154,16 +1152,16 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;

// Next power of 2 greater than or equal to __n
auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast<::std::make_unsigned_t<_Size>>(__n));
auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast<std::make_unsigned_t<_Size>>(__n));

// Pessimistically only use half of the memory to take into account memory used by compiled kernel
const ::std::size_t __max_slm_size =
const std::size_t __max_slm_size =
__exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>() / 2;

// The kernel stores n integers for the predicate and another n integers for the offsets
const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2;
const auto __req_slm_size = sizeof(std::uint16_t) * __n_uniform * 2;

constexpr ::std::uint16_t __single_group_upper_limit = 2048;
constexpr std::uint16_t __single_group_upper_limit = 2048;

std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);

Expand Down
2 changes: 1 addition & 1 deletion include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,9 +21,9 @@

#include "../../utils_ranges.h"
#include "../../iterator_impl.h"
#include "../../glue_numeric_defs.h"
#include "sycl_iterator.h"
#include "sycl_defs.h"
#include "execution_sycl_defs.h"

namespace oneapi
{
Expand Down
2 changes: 1 addition & 1 deletion include/oneapi/dpl/pstl/iterator_impl.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -797,7 +797,7 @@ struct ignore_copyable
}

bool
operator==(const ignore_copyable& other) const
operator==(const ignore_copyable&) const
{
return true;
}
Expand Down
1 change: 0 additions & 1 deletion include/oneapi/dpl/pstl/utils_ranges.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,7 +21,6 @@

#include "iterator_defs.h"
#include "iterator_impl.h"
#include "execution_defs.h"

namespace oneapi
{
Expand Down
2 changes: 1 addition & 1 deletion test/support/test_config.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,7 +63,7 @@
(!PSTL_USE_DEBUG && (__linux__ || __APPLE__) && __INTEL_COMPILER == 1900)
// ICC 19 generates wrong result with UDS on Windows
#define _PSTL_ICC_19_TEST_SIMD_UDS_WINDOWS_RELEASE_BROKEN (__INTEL_COMPILER == 1900 && _MSC_VER && !_DEBUG)
// ICPC compiler generates wrong "openMP simd" code for a user defined scan operation(UDS) for MacOS, Linux and Windows
// ICPC compiler generates wrong "openMP simd" code for a user defined scan operation(UDS)
#define _PSTL_ICC_TEST_SIMD_UDS_BROKEN \
(__INTEL_COMPILER && __INTEL_COMPILER_BUILD_DATE < 20211123)
// ICC 18,19 generate wrong result
Expand Down

0 comments on commit 29cd230

Please sign in to comment.