diff --git a/libcudacxx/include/cuda/std/__type_traits/address_stability.h b/libcudacxx/include/cuda/std/__type_traits/address_stability.h new file mode 100644 index 00000000000..939b15bfa59 --- /dev/null +++ b/libcudacxx/include/cuda/std/__type_traits/address_stability.h @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H +#define _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// need a separate implementation trait because we SFINAE with a type parameter before the variadic pack +template +struct __allows_copied_arguments_impl : false_type +{}; + +template +struct __allows_copied_arguments_impl, Args...> +{ + static constexpr bool value = F::allows_copied_arguments; +}; + +//! Trait telling whether a function object relies on the memory address of its arguments when called with the given set +//! of types. The nested value is true when the addresses of the arguments do not matter and arguments can be provided +//! from arbitrary copies of the respective sources. Can be specialized for custom function objects and parameter types. +template +struct allows_copied_arguments : __allows_copied_arguments_impl +{}; + +#if _CCCL_STD_VER >= 2014 +template +_CCCL_GLOBAL_CONSTANT bool allows_copied_arguments_v = allows_copied_arguments::value; +#endif // _CCCL_STD_VER >= 2014 + +//! Wrapper for a callable to mark it as allowing copied arguments +template +struct callable_allowing_copied_arguments : F +{ + using F::operator(); + static constexpr bool allows_copied_arguments = true; +}; + +//! Creates a new function object from an existing one, allowing its arguments to be copies of whatever source they come +//! from. This implies that the addresses of the arguments are irrelevant to the function object. +template +_CCCL_HOST_DEVICE constexpr auto allow_copied_arguments(F f) -> callable_allowing_copied_arguments +{ + return callable_allowing_copied_arguments{_CUDA_VSTD::move(f)}; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H diff --git a/libcudacxx/include/cuda/std/type_traits b/libcudacxx/include/cuda/std/type_traits index 09729c71800..566cb278a5b 100644 --- a/libcudacxx/include/cuda/std/type_traits +++ b/libcudacxx/include/cuda/std/type_traits @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu index 088517f87b3..c146a0209c2 100644 --- a/thrust/benchmarks/bench/transform/basic.cu +++ b/thrust/benchmarks/bench/transform/basic.cu @@ -33,6 +33,9 @@ #include #include +#include +#include + #include template @@ -121,9 +124,10 @@ static void mul(nvbench::state& state, nvbench::type_list) state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { const T scalar = startScalar; - thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) { - return ci * scalar; - }); + thrust::transform( + c.begin(), c.end(), b.begin(), cuda::std::allow_copied_arguments([=] __device__ __host__(const T& ci) { + return ci * scalar; + })); }); } @@ -145,9 +149,14 @@ static void add(nvbench::state& state, nvbench::type_list) state.add_global_memory_writes(n); state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { - thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) { - return ai + bi; - }); + thrust::transform( + a.begin(), + a.end(), + b.begin(), + c.begin(), + cuda::std::allow_copied_arguments([] _CCCL_DEVICE(const T& ai, const T& bi) -> T { + return ai + bi; + })); }); } @@ -170,9 +179,14 @@ static void triad(nvbench::state& state, nvbench::type_list) state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { const T scalar = startScalar; - thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) { - return bi + scalar * ci; - }); + thrust::transform( + b.begin(), + b.end(), + c.begin(), + a.begin(), + cuda::std::allow_copied_arguments([=] _CCCL_DEVICE(const T& bi, const T& ci) { + return bi + scalar * ci; + })); }); } @@ -199,9 +213,11 @@ static void nstream(nvbench::state& state, nvbench::type_list) thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()), thrust::make_zip_iterator(a.end(), b.end(), c.end()), a.begin(), - thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) { - return ai + bi + scalar * ci; - })); + + thrust::make_zip_function( + cuda::std::allow_copied_arguments([=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { + return ai + bi + scalar * ci; + }))); }); } diff --git a/thrust/testing/address_stability.cu b/thrust/testing/address_stability.cu new file mode 100644 index 00000000000..d589a9c0c86 --- /dev/null +++ b/thrust/testing/address_stability.cu @@ -0,0 +1,26 @@ +#include + +#include + +// TODO(bgruber): move this test into libcu++ + +struct my_plus +{ + _CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int + { + return a + b; + } +}; + +void TestAddressStability() +{ + using ::cuda::std::allow_copied_arguments; + using ::cuda::std::allows_copied_arguments; + + static_assert(!allows_copied_arguments, int, int>::value, ""); + static_assert(allows_copied_arguments{})), int, int>::value, ""); + + static_assert(!allows_copied_arguments::value, ""); + static_assert(allows_copied_arguments::value, ""); +} +DECLARE_UNITTEST(TestAddressStability); diff --git a/thrust/testing/cuda/transform.cu b/thrust/testing/cuda/transform.cu index 3fd3ed22470..9a41275d14d 100644 --- a/thrust/testing/cuda/transform.cu +++ b/thrust/testing/cuda/transform.cu @@ -413,3 +413,76 @@ void TestTransformBinaryCudaStreams() cudaStreamDestroy(s); } DECLARE_UNITTEST(TestTransformBinaryCudaStreams); + +struct sum_five +{ + _CCCL_HOST_DEVICE auto + operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double + { + return a + b + c + d + e; + } +}; + +// we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail +THRUST_NAMESPACE_BEGIN +template <> +class zip_function +{ +public: + _CCCL_HOST_DEVICE zip_function(sum_five func) + : func(func) + {} + + _CCCL_HOST_DEVICE sum_five& underlying_function() const + { + return func; + } + + template + _CCCL_HOST_DEVICE auto + operator()(Tuple&& t) const -> decltype(detail::zip_detail::apply(std::declval(), THRUST_FWD(t))) + { + // not calling func, so we would get a wrong result if we were called + return {}; + } + +private: + mutable sum_five func; +}; +THRUST_NAMESPACE_END + +// test that the cuda_cub backend of Thrust unwraps zip_iterators/zip_functions into their input streams +void TestTransformZipIteratorUnwrapping() +{ + constexpr int num_items = 100; + thrust::device_vector a(num_items, 1); + thrust::device_vector b(num_items, 2); + thrust::device_vector c(num_items, 3); + thrust::device_vector d(num_items, 4); + thrust::device_vector e(num_items, 5); + + thrust::device_vector result(num_items); + // SECTION("once") // TODO(bgruber): enable sections when we migrate to Catch2 + { + const auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()); + thrust::transform(z, z + num_items, result.begin(), thrust::make_zip_function(sum_five{})); + + // compute reference and verify + thrust::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + ASSERT_EQUAL(reference, result); + } + // SECTION("trice") + { + const auto z = thrust::make_zip_iterator( + thrust::make_zip_iterator(thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()))); + thrust::transform(z, + z + num_items, + result.begin(), + thrust::make_zip_function(thrust::make_zip_function(thrust::make_zip_function(sum_five{})))); + + // compute reference and verify + thrust::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + ASSERT_EQUAL(reference, result); + } +} +DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping); diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h index 3f4f148f5d4..75e04d58306 100644 --- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h @@ -50,6 +50,10 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { +// Need a forward declaration here to work around a cyclic include, since "cuda/detail/transform.h" includes this header +template +OutputIt THRUST_FUNCTION +transform(execution_policy& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op); namespace __copy { diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 9e1d0b2a318..b2b3909d1fe 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -39,26 +39,32 @@ #ifdef _CCCL_CUDA_COMPILER # include -# include +# include + # include +# include +# include # include # include +# include + +# include + +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub { - namespace __transform { - struct no_stencil_tag {}; struct always_true_predicate { template - bool THRUST_DEVICE_FUNCTION operator()(T const&) const + constexpr bool THRUST_DEVICE_FUNCTION operator()(T const&) const { return true; } @@ -231,6 +237,66 @@ OutputIt THRUST_FUNCTION binary( return result + num_items; } +_CCCL_EXEC_CHECK_DISABLE +template +OutputIt THRUST_FUNCTION cub_transform_many( + execution_policy& policy, + ::cuda::std::tuple firsts, + OutputIt result, + Offset num_items, + TransformOp transform_op) +{ + if (num_items == 0) + { + return result; + } + + // TODO(bgruber): iterator_reference_t or iterator_value_type? + constexpr auto requires_stable_address = + !::cuda::std::allows_copied_arguments...>::value; + using dispatch32_t = cub::detail::transform:: + dispatch_t, OutputIt, TransformOp>; + using dispatch64_t = cub::detail::transform:: + dispatch_t, OutputIt, TransformOp>; + + cudaError_t status; + THRUST_INDEX_TYPE_DISPATCH2( + status, + dispatch32_t::dispatch, + dispatch64_t::dispatch, + num_items, + (firsts, result, num_items_fixed, transform_op, cuda_cub::stream(policy))); + throw_on_error(status, "transform: failed inside CUB"); + + status = cuda_cub::synchronize_optional(policy); + throw_on_error(status, "transform: failed to synchronize"); + + return result + num_items; +} + +template +THRUST_FUNCTION auto +convert_to_std_tuple(tuple t, ::cuda::std::index_sequence) -> ::cuda::std::tuple +{ + return ::cuda::std::tuple{get(t)...}; +} + +// unwrap zip_iterator and zip_function into their underlying iterators so cub::DeviceTransform can optimize them +template +OutputIt THRUST_FUNCTION cub_transform_many( + execution_policy& policy, + ::cuda::std::tuple>> firsts, + OutputIt result, + Offset num_items, + zip_function transform_op) +{ + return cub_transform_many( + policy, + convert_to_std_tuple(get<0>(firsts).get_iterator_tuple(), ::cuda::std::index_sequence_for{}), + result, + num_items, + transform_op.underlying_function()); +} } // namespace __transform //------------------------- @@ -272,7 +338,15 @@ template OutputIt THRUST_FUNCTION transform(execution_policy& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op) { - return cuda_cub::transform_if(policy, first, last, result, transform_op, __transform::always_true_predicate()); + THRUST_CDP_DISPATCH( + (using size_type = typename iterator_traits::difference_type; + const auto num_items = static_cast(thrust::distance(first, last)); + return __transform::cub_transform_many(policy, ::cuda::std::make_tuple(first), result, num_items, transform_op);), + (while (first != last) { + *result = transform_op(raw_reference_cast(*first)); + ++first; + ++result; + } return result;)); } // func transform //------------------------- @@ -310,17 +384,18 @@ OutputIt THRUST_FUNCTION transform( OutputIt result, TransformOp transform_op) { - return cuda_cub::transform_if( - policy, - first1, - last1, - first2, - __transform::no_stencil_tag(), - result, - transform_op, - __transform::always_true_predicate()); -} // func transform - + THRUST_CDP_DISPATCH( + (using size_type = typename iterator_traits::difference_type; + const auto num_items = static_cast(thrust::distance(first1, last1)); + return __transform::cub_transform_many( + policy, ::cuda::std::make_tuple(first1, first2), result, num_items, transform_op);), + (while (first1 != last1) { + *result = transform_op(raw_reference_cast(*first1), raw_reference_cast(*first2)); + ++first1; + ++first2; + ++result; + } return result;)); +} } // namespace cuda_cub THRUST_NAMESPACE_END