diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
new file mode 100644
index 00000000..8b3b3285
--- /dev/null
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -0,0 +1,106 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
+// See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////////////
+
+// CARE headers
+#include "care/DefaultMacros.h"
+#include "care/host_device_ptr.h"
+#include "care/forall.h"
+#include "care/policies.h"
+#include "RAJA/RAJA.hpp"
+
+// Other library headers
+#include <benchmark/benchmark.h>
+#include <omp.h>
+
+// Std library headers
+#include <climits>
+#include <cmath>
+
+#define size 1000000
+
+#if defined(CARE_GPUCC)
+// each kernel has a separate stream
+static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
+   int N = state.range(0);
+   care::Resource res_arr[16];
+   RAJA::resources::Event event_arr[16];
+   care::host_device_ptr<float> arrays[16];
+   for (int i = 0; i < N; i++)
+   {
+      res_arr[i] = care::Resource();
+      event_arr[i] = res_arr[i].get_event();
+      arrays[i] = care::host_device_ptr<float>(size, "arr");
+   }
+
+   // warmup kernel
+   CARE_GPU_LOOP(i, 0, size) {
+      arrays[0][i] = 0;
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
+
+   for (auto _ : state) {
+      // run N kernels
+      omp_set_num_threads(N);
+      #pragma omp parallel for
+      for (int j = 0; j < N; j++)
+      {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0, size) {
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
+         } CARE_STREAMED_LOOP_END
+      }
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
+   }
+
+   for (int i = 0; i < N; i++) {
+      arrays[i].free();
+   }
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
+
+// all kernels on one stream
+static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
+   int N = state.range(0);
+
+   care::host_device_ptr<float> arrays[16];
+   for (int i = 0; i < N; i++)
+   {
+      arrays[i] = care::host_device_ptr<float>(size, "arr");
+   }
+
+   // warmup kernel
+   CARE_GPU_LOOP(i, 0, size) {
+      arrays[0][i] = 0;
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
+
+   for (auto _ : state) {
+      // run N kernels
+      for (int j = 0; j < N; j++)
+      {
+         CARE_GPU_LOOP(i, 0, size) {
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
+         } CARE_GPU_LOOP_END
+      }
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
+   }
+
+   for (int i = 0; i < N; i++) {
+      arrays[i].free();
+   }
+}
+
+// Register the function as a benchmark
BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
+
+#endif
+
+// Run the benchmarks
+BENCHMARK_MAIN();
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 5f798b75..cfde099a 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -36,6 +36,19 @@ target_include_directories(BenchmarkForall
 blt_add_benchmark(NAME BenchmarkForall
                   COMMAND BenchmarkForall)
 
+blt_add_executable(NAME BenchmarkRaja
+                   SOURCES BenchmarkRaja.cpp
+                   DEPENDS_ON ${care_benchmark_depends})
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_SOURCE_DIR}/src)
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_BINARY_DIR}/include)
+
+blt_add_benchmark(NAME BenchmarkRaja
+                  COMMAND BenchmarkRaja)
+
 blt_add_executable(NAME BenchmarkNumeric
                    SOURCES BenchmarkNumeric.cpp
                    DEPENDS_ON ${care_benchmark_depends})
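Note: the benchmark above exercises the CARE_STREAMED_LOOP macro introduced in src/care/DefaultMacros.h below. A minimal sketch of the intended usage pattern outside of Google Benchmark, assuming a GPU build (CARE_GPUCC defined) and the care::Resource typedef this change adds to src/care/forall.h; the array name, length, and values are illustrative:

// Sketch: launch one kernel on a dedicated stream, then synchronize.
#include "care/DefaultMacros.h"
#include "care/forall.h"
#include "care/host_device_ptr.h"

int main() {
   care::Resource stream;                           // one RAJA resource == one stream
   care::host_device_ptr<float> data(1024, "data"); // illustrative array

   // The loop body is enqueued on `stream` rather than the default stream,
   // so kernels launched on different resources are free to overlap.
   CARE_STREAMED_LOOP(stream, i, 0, 1024) {
      data[i] = 2.0f * i;
   } CARE_STREAMED_LOOP_END

   // Block until all enqueued work has finished before using the results.
   care::gpuDeviceSynchronize(__FILE__, __LINE__);
   data.free();
   return 0;
}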
diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 1de5ec51..6e055d25 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -261,6 +261,10 @@
 
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
 
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -548,6 +552,15 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
    CARE_NEST_END(CHECK) }}
 
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) { \
+   if (END_INDEX > START_INDEX) { \
+      CARE_NEST_BEGIN(CHECK) \
+      care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
+      CARE_SET_THREAD_ID(INDEX)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) }); \
+   CARE_NEST_END(CHECK) }}
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -753,6 +766,10 @@
 
 #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)
 
+#define CARE_STREAMED_LOOP(RESOURCE, INDEX, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, care_streamed_loop_check)
+
+#define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a RAJA loop that uses at least one
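To make the control flow concrete, hand-expanding CARE_STREAMED_LOOP(res, i, 0, n) { ... } CARE_STREAMED_LOOP_END in a GPU build gives roughly the following (a sketch, with the CARE_NEST_BEGIN/CARE_NEST_END and CARE_SET_THREAD_ID bookkeeping elided):

{
   if (n > 0) {
      // forall_with_stream (added to src/care/forall.h below) launches the
      // lambda on the stream owned by `res` instead of the default stream.
      care::forall_with_stream(care::gpu{}, res, __FILE__, __LINE__, 0, n,
                               [=] CARE_DEVICE (const int i) {
         // ... loop body ...
      });
   }
}

In CPU-only builds the macro simply forwards to CARE_CHECKED_OPENMP_FOR_LOOP_START, so the RESOURCE argument is accepted but ignored.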
diff --git a/src/care/forall.h b/src/care/forall.h
index d45433cc..20036cd4 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -30,22 +30,26 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
    static bool s_reverseLoopOrder = false;
 #endif
-   
+
    template <typename ExecutionPolicy>
    struct ExecutionPolicyToSpace {
      static constexpr const chai::ExecutionSpace value = chai::CPU;
    };
 
 #if defined(__CUDACC__)
+   typedef RAJA::resources::Cuda Resource;
    template <>
    struct ExecutionPolicyToSpace<RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, true>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
    };
 #elif defined (__HIPCC__)
+   typedef RAJA::resources::Hip Resource;
    template <>
    struct ExecutionPolicyToSpace<RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, true>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
    };
+#else
+   typedef RAJA::resources::Host Resource;
 #endif
 
 #if CARE_ENABLE_GPU_SIMULATION_MODE
@@ -97,6 +101,50 @@ namespace care {
       }
    }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Peter Robinson, Alan Dayton
+   ///
+   /// @brief Loops over the given indices and calls the loop body with each index.
+   ///        This overload is CHAI and RAJA aware and sets the execution space accordingly.
+   ///
+   /// @arg[in] policy Used to choose this overload of forall
+   /// @arg[in] res Resource to be used
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+   template <typename ExecutionPolicy, typename R, typename LB>
+   void forall(ExecutionPolicy /* policy */, R res, const char * fileName, const int lineNumber,
+               const int start, const int end, LB&& body) {
+      const int length = end - start;
+
+      if (length != 0) {
+         PluginData::setFileName(fileName);
+         PluginData::setLineNumber(lineNumber);
+
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+         RAJA::RangeStrideSegment rangeSegment =
+            s_reverseLoopOrder ?
+            RAJA::RangeStrideSegment(end - 1, start - 1, -1) :
+            RAJA::RangeStrideSegment(start, end, 1);
+#else
+         RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(start, end);
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+         RAJA::forall<RAJA::seq_exec>(res, rangeSegment, std::forward<LB>(body));
+#else
+         RAJA::forall<ExecutionPolicy>(res, rangeSegment, std::forward<LB>(body));
+#endif
+      }
+   }
+
+
    ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -189,6 +237,49 @@ namespace care {
 #endif
    }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Neela Kausik
+   ///
+   /// @brief If GPU is available, execute on the device. Otherwise, execute on
+   ///        the host. This specialization is needed for clang-query.
+   ///
+   /// @arg[in] gpu Used to choose this overload of forall
+   /// @arg[in] res Resource provided for execution
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CARE_GPUCC)
+   template <typename LB>
+   void forall_with_stream(gpu, Resource res, const char * fileName, const int lineNumber,
+                           const int start, const int end, LB&& body) {
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = true;
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+      forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#elif defined(__CUDACC__)
+      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, true>{},
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#elif defined(__HIPCC__)
+      forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, true>{},
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#else
+      forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#endif
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = false;
+#endif
+   }
+#endif
+
    ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
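Note that the benchmark synchronizes with care::gpuDeviceSynchronize, which blocks on the whole device; the events it collects via res_arr[i].get_event() are not yet consumed. For ordering between two streams without a full device sync, RAJA resources can wait on each other's events. A sketch, assuming a GPU build and RAJA's resource event API (get_event and wait_for); the function and variable names are illustrative:

#include "care/DefaultMacros.h"
#include "care/forall.h"
#include "care/host_device_ptr.h"
#include <cmath>

// Chain two kernels across streams without blocking the host.
void chain_streams(care::Resource& producer, care::Resource& consumer,
                   care::host_device_ptr<float>& data, int n) {
   CARE_STREAMED_LOOP(producer, i, 0, n) {
      data[i] = sqrtf(i);
   } CARE_STREAMED_LOOP_END

   // Record the producer stream's current position...
   RAJA::resources::Event e = producer.get_event();
   // ...and make the consumer stream wait on it.
   consumer.wait_for(&e);

   CARE_STREAMED_LOOP(consumer, i, 0, n) {
      data[i] *= 2.0f;
   } CARE_STREAMED_LOOP_END
}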