Skip to content

Commit

Permalink
Make thrust::transform use cub::DeviceTransform
Browse files Browse the repository at this point in the history
* Introduces address stability detection and opt-in in libcu++
* Mark lambdas in Thrust BabelStream benchmark address oblivious

Fixes: NVIDIA#2263
  • Loading branch information
bernhardmgruber committed Sep 9, 2024
1 parent 14c51a0 commit c662a2f
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 27 deletions.
70 changes: 70 additions & 0 deletions libcudacxx/include/cuda/std/__type_traits/address_stability.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H
#define _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__type_traits/void_t.h>
#include <cuda/std/__utility/move.h>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

// need a separate implementation trait because we SFINAE with a type parameter before the variadic pack
template <typename F, typename SFINAE, typename... Args>
struct __allows_copied_arguments_impl : _CUDA_VSTD::false_type
{};

template <typename F, typename... Args>
struct __allows_copied_arguments_impl<F, _CUDA_VSTD::void_t<decltype(F::allows_copied_arguments)>, Args...>
{
static constexpr bool value = F::allows_copied_arguments;
};

//! Trait telling whether a function object relies on the memory address of its arguments when called with the given set
//! of types. The nested value is true when the addresses of the arguments do not matter and arguments can be provided
//! from arbitrary copies of the respective sources. Can be specialized for custom function objects and parameter types.
template <typename F, typename... Args>
struct allows_copied_arguments : __allows_copied_arguments_impl<F, void, Args...>
{};

#if _CCCL_STD_VER >= 2014
template <typename F, typename... Args>
_LIBCUDACXX_INLINE_VAR constexpr bool allows_copied_arguments_v = allows_copied_arguments<F, Args...>::value;
#endif // _CCCL_STD_VER >= 2014

//! Wrapper for a callable to mark it as allowing copied arguments
template <typename F>
struct callable_allowing_copied_arguments : F
{
using F::operator();
static constexpr bool allows_copied_arguments = true;
};

//! Creates a new function object from an existing one, allowing its arguments to be copies of whatever source they come
//! from. This implies that the addresses of the arguments are irrelevant to the function object.
template <typename F>
_CCCL_HOST_DEVICE constexpr auto allow_copied_arguments(F f) -> callable_allowing_copied_arguments<F>
{
return callable_allowing_copied_arguments<F>{_CUDA_VSTD::move(f)};
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/type_traits
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <cuda/std/__type_traits/add_pointer.h>
#include <cuda/std/__type_traits/add_rvalue_reference.h>
#include <cuda/std/__type_traits/add_volatile.h>
#include <cuda/std/__type_traits/address_stability.h>
#include <cuda/std/__type_traits/aligned_storage.h>
#include <cuda/std/__type_traits/aligned_union.h>
#include <cuda/std/__type_traits/alignment_of.h>
Expand Down
32 changes: 21 additions & 11 deletions thrust/benchmarks/bench/transform/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
#include <thrust/transform.h>
#include <thrust/zip_function.h>

#include <cuda/functional>
#include <cuda/std/__type_traits/address_stability.h>

#include <nvbench_helper.cuh>

template <class InT, class OutT>
Expand Down Expand Up @@ -121,9 +124,9 @@ static void mul(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) {
return ci * scalar;
});
thrust::transform(c.begin(), c.end(), b.begin(), cuda::allow_copied_arguments([=] __device__ __host__(const T& ci) {
return ci * scalar;
}));
});
}

Expand All @@ -145,9 +148,14 @@ static void add(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(n);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) {
return ai + bi;
});
thrust::transform(
a.begin(),
a.end(),
b.begin(),
c.begin(),
cuda::allow_copied_arguments([] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
return ai + bi;
}));
});
}

Expand All @@ -170,9 +178,10 @@ static void triad(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) {
return bi + scalar * ci;
});
thrust::transform(
b.begin(), b.end(), c.begin(), a.begin(), cuda::allow_copied_arguments([=] _CCCL_DEVICE(const T& bi, const T& ci) {
return bi + scalar * ci;
}));
});
}

Expand All @@ -199,9 +208,10 @@ static void nstream(nvbench::state& state, nvbench::type_list<T>)
thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()),
thrust::make_zip_iterator(a.end(), b.end(), c.end()),
a.begin(),
thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) {

thrust::make_zip_function(cuda::allow_copied_arguments([=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
}));
})));
});
}

Expand Down
26 changes: 26 additions & 0 deletions thrust/testing/address_stability.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include <cuda/std/__type_traits/address_stability.h>

#include <unittest/unittest.h>

// TODO(bgruber): move this test into libcu++

struct my_plus
{
_CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int
{
return a + b;
}
};

void TestAddressStability()
{
using ::cuda::allow_copied_arguments;
using ::cuda::allows_copied_arguments;

static_assert(!allows_copied_arguments<thrust::plus<int>, int, int>::value, "");
static_assert(allows_copied_arguments<decltype(allow_copied_arguments(thrust::plus<int>{})), int, int>::value, "");

static_assert(!allows_copied_arguments<my_plus, int, int>::value, "");
static_assert(allows_copied_arguments<decltype(allow_copied_arguments(my_plus{})), int, int>::value, "");
}
DECLARE_UNITTEST(TestAddressStability);
73 changes: 73 additions & 0 deletions thrust/testing/cuda/transform.cu
Original file line number Diff line number Diff line change
Expand Up @@ -413,3 +413,76 @@ void TestTransformBinaryCudaStreams()
cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestTransformBinaryCudaStreams);

struct sum_five
{
_CCCL_HOST_DEVICE auto
operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double
{
return a + b + c + d + e;
}
};

// we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail
THRUST_NAMESPACE_BEGIN
template <>
class zip_function<sum_five>
{
public:
_CCCL_HOST_DEVICE zip_function(sum_five func)
: func(func)
{}

_CCCL_HOST_DEVICE sum_five& underlying_function() const
{
return func;
}

template <typename Tuple>
_CCCL_HOST_DEVICE auto
operator()(Tuple&& t) const -> decltype(detail::zip_detail::apply(std::declval<sum_five>(), THRUST_FWD(t)))
{
// not calling func, so we would get a wrong result if we were called
return {};
}

private:
mutable sum_five func;
};
THRUST_NAMESPACE_END

// test that the cuda_cub backend of Thrust unwraps zip_iterators/zip_functions into their input streams
void TestTransformZipIteratorUnwrapping()
{
constexpr int num_items = 100;
thrust::device_vector<std::int8_t> a(num_items, 1);
thrust::device_vector<std::int16_t> b(num_items, 2);
thrust::device_vector<std::int32_t> c(num_items, 3);
thrust::device_vector<std::int64_t> d(num_items, 4);
thrust::device_vector<float> e(num_items, 5);

thrust::device_vector<double> result(num_items);
// SECTION("once") // TODO(bgruber): enable sections when we migrate to Catch2
{
const auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin());
thrust::transform(z, z + num_items, result.begin(), thrust::make_zip_function(sum_five{}));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
// SECTION("trice")
{
const auto z = thrust::make_zip_iterator(
thrust::make_zip_iterator(thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin())));
thrust::transform(z,
z + num_items,
result.begin(),
thrust::make_zip_function(thrust::make_zip_function(thrust::make_zip_function(sum_five{}))));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
}
DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping);
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
THRUST_NAMESPACE_BEGIN
namespace cuda_cub
{
// Need a forward declaration here to work around a cyclic include, since "cuda/detail/transform.h" includes this header
template <class Derived, class InputIt, class OutputIt, class TransformOp>
OutputIt THRUST_FUNCTION
transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);

namespace __copy
{
Expand Down
Loading

0 comments on commit c662a2f

Please sign in to comment.