
added forall_with_streams and updated BenchmarkForall.cpp #232

Open · wants to merge 19 commits into base: develop
105 changes: 105 additions & 0 deletions benchmarks/BenchmarkRaja.cpp
@@ -0,0 +1,105 @@
//////////////////////////////////////////////////////////////////////////////////////
// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
// See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: BSD-3-Clause
//////////////////////////////////////////////////////////////////////////////////////

// CARE headers
#include "care/DefaultMacros.h"
#include "care/host_device_ptr.h"
#include "care/forall.h"
#include "care/policies.h"
#include "RAJA/RAJA.hpp"

// Other library headers
#include <benchmark/benchmark.h>
#include <omp.h>

// Std library headers
#include <climits>
#include <cmath>

#define size 1000000

namespace care{

#if defined(CARE_GPUCC)
// Each kernel runs on a separate stream
static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
int N = state.range(0);
RAJA::resources::Cuda res_arr[N];
RAJA::resources::Event event_arr[N];
care::host_device_ptr<int> arrays[16];
for(int i = 0; i < N; i++)
{
RAJA::resources::Cuda res;
res_arr[i] = res;
RAJA::resources::Event e = res.get_event();
event_arr[i] = e;
care::host_device_ptr<int> arr(size, "arr");
arrays[i] = arr;
}

// Warmup kernel
RAJA::resources::Cuda warmup_res;
CARE_STREAMED_LOOP(warmup_res, i, 0, size) {
arrays[0][i] = 0;
} CARE_STREAMED_LOOP_END

for (auto _ : state) {
// Launch N kernels
for(int j = 0; j < N; j++)
{
CARE_STREAMED_LOOP(res_arr[j], i, 0, size) {
arrays[j][i] = i;
} CARE_STREAMED_LOOP_END
}
}
for(int i = 0; i < N; i++) {arrays[i].free();}
}

// Register the function as a benchmark
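// Each Arg value becomes state.range(0), i.e. the number of kernels (N)
// launched per benchmark iteration.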
BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);

// All kernels share one stream
static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
int N = state.range(0);

RAJA::resources::Cuda res;

care::host_device_ptr<int> arrays[16];
for(int i = 0; i < N; i++)
{
care::host_device_ptr<int> arr(size, "arr");
arrays[i] = arr;
}

// Warmup kernel
RAJA::resources::Cuda warmup_res;
CARE_STREAMED_LOOP(warmup_res, i, 0, size) {
arrays[0][i] = i;
} CARE_STREAMED_LOOP_END

for (auto _ : state) {
// Launch N kernels
for(int j = 0; j < N; j++)
{
CARE_STREAMED_LOOP(res, i, 0, size) {
arrays[j][i] = i;
} CARE_STREAMED_LOOP_END
res.wait();
}
}
for(int i = 0; i < N; i++) {arrays[i].free();}
}

// Register the function as a benchmark
BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);

#endif

} //namespace care

// Run the benchmarks
BENCHMARK_MAIN();
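
For orientation, each CARE_STREAMED_LOOP above amounts to roughly the raw RAJA call below. This is a sketch, assuming RAJA's resource-aware forall overload; launch_on_stream, arr, and n are hypothetical names, not part of this PR.

// Sketch: one asynchronous kernel launch on the stream owned by res.
// RAJA::forall returns an event proxy that can be waited on, which is what
// benchmark_gpu_loop_single_stream does via res.wait().
#include "RAJA/RAJA.hpp"

void launch_on_stream(RAJA::resources::Cuda res, int* arr, int n) {
   RAJA::forall<RAJA::cuda_exec_async<256>>(
      res, RAJA::RangeSegment(0, n),
      [=] RAJA_DEVICE (int i) { arr[i] = i; });
}
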
13 changes: 13 additions & 0 deletions benchmarks/CMakeLists.txt
@@ -36,6 +36,19 @@ target_include_directories(BenchmarkForall
blt_add_benchmark(NAME BenchmarkForall
COMMAND BenchmarkForall)

blt_add_executable(NAME BenchmarkRaja
SOURCES BenchmarkRaja.cpp
DEPENDS_ON ${care_benchmark_depends})

target_include_directories(BenchmarkRaja
PRIVATE ${PROJECT_SOURCE_DIR}/src)

target_include_directories(BenchmarkRaja
PRIVATE ${PROJECT_BINARY_DIR}/include)

blt_add_benchmark(NAME BenchmarkRaja
COMMAND BenchmarkRaja)

blt_add_executable(NAME BenchmarkNumeric
SOURCES BenchmarkNumeric.cpp
DEPENDS_ON ${care_benchmark_depends})
17 changes: 17 additions & 0 deletions src/care/DefaultMacros.h
@@ -261,6 +261,10 @@

#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)

#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)

#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -548,6 +552,15 @@
#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
CARE_NEST_END(CHECK) }}

#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) { \
if (END_INDEX > START_INDEX) { \
CARE_NEST_BEGIN(CHECK) \
care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
CARE_SET_THREAD_ID(INDEX)

#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) }); \
CARE_NEST_END(CHECK) }}

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -753,6 +766,10 @@

#define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)

#define CARE_STREAMED_LOOP(RESOURCE, INDEX, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, care_streamed_loop_check)

#define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)
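
Taken together with the GPU-path definitions above, a use such as CARE_STREAMED_LOOP(res, i, 0, n) { arr[i] = i; } CARE_STREAMED_LOOP_END expands in a GPU build to roughly the following (arr, res, and n are placeholder names):

// Approximate expansion of the new macro pair in a GPU build.
{
   if (n > 0) {
      CARE_NEST_BEGIN(care_streamed_loop_check)
      care::forall_with_stream(care::gpu{}, res, __FILE__, __LINE__, 0, n,
                               [=] CARE_DEVICE (const int i) {
         CARE_SET_THREAD_ID(i)
         arr[i] = i;
      });
      CARE_NEST_END(care_streamed_loop_check)
   }
}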

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a RAJA loop that uses at least one
43 changes: 43 additions & 0 deletions src/care/forall.h
@@ -189,6 +189,49 @@ namespace care {
#endif
}

////////////////////////////////////////////////////////////////////////////////
///
/// @author Neela Kausik
///
/// @brief If GPU is available, execute on the device. Otherwise, execute on
/// the host. This specialization is needed for clang-query.
///
/// @arg[in] gpu Used to choose this overload of forall
/// @arg[in] res Resource provided for execution
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] start The starting index (inclusive)
/// @arg[in] end The ending index (exclusive)
/// @arg[in] body The loop body to execute at each index
///
////////////////////////////////////////////////////////////////////////////////

#if defined(CARE_GPUCC)
template <typename LB>
void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
const int start, const int end, LB&& body) {
#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = true;
#endif

#if CARE_ENABLE_GPU_SIMULATION_MODE
forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#elif defined(__CUDACC__)
forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
Review thread:

Member: I think you need an overload of forall that takes a resource, right?

Member Author: What do you mean?

Member: This is the function you are calling: https://github.com/LLNL/CARE/pull/232/files#diff-1df40e04088de0f82501a0065752487396b8abeb4c3d30780e79119cc63789a7R74

But it does not accept a resource argument. I'm confused at how this is working.

Member Author: I thought it was calling RAJA::forall, but will look into it further.

Member: It shouldn't be calling RAJA::forall - there's no overload that takes the fileName and lineNumber.

Member: Also, that's the main reason I dislike "using namespace..." statements - it's too easy to accidentally call the wrong function.

#elif defined(__HIPCC__)
forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
#else
forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#endif

#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = false;
#endif
}
#endif
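
Following up on the review thread above, here is a hypothetical sketch of a care::forall overload that accepts a resource, mirroring the signature pattern of the existing overloads. It is an assumption about what a fix could look like, not part of this PR:

// Hypothetical sketch only. The fileName/lineNumber parameters are kept to
// match the other care::forall overloads; they are unused here.
template <typename ExecutionPolicy, typename LB>
void forall(ExecutionPolicy /* policy */, RAJA::resources::Cuda res,
            const char * /* fileName */, const int /* lineNumber */,
            const int start, const int end, LB&& body) {
   // Qualify the call so the resource-aware RAJA::forall is selected
   // explicitly rather than being found through a using-directive.
   RAJA::forall<ExecutionPolicy>(res, RAJA::RangeSegment(start, end),
                                 std::forward<LB>(body));
}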

////////////////////////////////////////////////////////////////////////////////
///
/// @author Alan Dayton