diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
new file mode 100644
index 00000000..8b3b3285
--- /dev/null
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -0,0 +1,106 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
+// See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////////////
+
+// CARE headers
+#include "care/DefaultMacros.h"
+#include "care/host_device_ptr.h"
+#include "care/forall.h"
+#include "care/policies.h"
+#include "RAJA/RAJA.hpp"
+
+// Other library headers
+#include <benchmark/benchmark.h>
+#include <omp.h>
+
+// Std library headers
+#include <climits>
+#include <cmath>
+
+#define size 1000000
+
+#if defined(CARE_GPUCC)
+// each kernel has a separate stream
+static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
+   int N = state.range(0);
+   care::Resource res_arr[16];
+   RAJA::resources::Event event_arr[16];
+   care::host_device_ptr<float> arrays[16];
+   for (int i = 0; i < N; i++)
+   {
+      res_arr[i] = care::Resource();
+      event_arr[i] = res_arr[i].get_event();
+      arrays[i] = care::host_device_ptr<float>(size, "arr");
+   }
+
+   // warmup kernel
+   CARE_GPU_LOOP(i, 0, size) {
+      arrays[0][i] = 0;
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
+
+   for (auto _ : state) {
+      // run N kernels
+      omp_set_num_threads(N);
+      #pragma omp parallel for
+      for (int j = 0; j < N; j++)
+      {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0, size) {
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
+         } CARE_STREAMED_LOOP_END
+      }
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
+   }
+
+   for (int i = 0; i < N; i++) {
+      arrays[i].free();
+   }
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
+
+// all kernels on one stream
+static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
+   int N = state.range(0);
+
+   care::host_device_ptr<float> arrays[16];
+   for (int i = 0; i < N; i++)
+   {
+      arrays[i] = care::host_device_ptr<float>(size, "arr");
+   }
+
+   // warmup kernel
+   CARE_GPU_LOOP(i, 0, size) {
+      arrays[0][i] = 0;
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
+
+   for (auto _ : state) {
+      // run N kernels
+      for (int j = 0; j < N; j++)
+      {
+         CARE_GPU_LOOP(i, 0, size) {
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
+         } CARE_GPU_LOOP_END
+      }
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
+   }
+
+   for (int i = 0; i < N; i++) {
+      arrays[i].free();
+   }
+}
+
+// Register the function as a benchmark
BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
+
+#endif
+
+// Run the benchmarks
+BENCHMARK_MAIN();
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 5f798b75..cfde099a 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -36,6 +36,19 @@ target_include_directories(BenchmarkForall
 blt_add_benchmark(NAME BenchmarkForall
                   COMMAND BenchmarkForall)
 
+blt_add_executable(NAME BenchmarkRaja
+                   SOURCES BenchmarkRaja.cpp
+                   DEPENDS_ON ${care_benchmark_depends})
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_SOURCE_DIR}/src)
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_BINARY_DIR}/include)
+
+blt_add_benchmark(NAME BenchmarkRaja
+                  COMMAND BenchmarkRaja)
+
 blt_add_executable(NAME BenchmarkNumeric
                    SOURCES BenchmarkNumeric.cpp
                    DEPENDS_ON ${care_benchmark_depends})
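Note: the benchmark above exercises the CARE_STREAMED_LOOP macro introduced in src/care/DefaultMacros.h below. A minimal sketch of the intended usage pattern outside of Google Benchmark, assuming a GPU build (CARE_GPUCC defined) and the care::Resource typedef this change adds to src/care/forall.h; the array name, length, and values are illustrative:

// Sketch: launch one kernel on a dedicated stream, then synchronize.
#include "care/DefaultMacros.h"
#include "care/forall.h"
#include "care/host_device_ptr.h"

int main() {
   care::Resource stream;                           // one RAJA resource == one stream
   care::host_device_ptr<float> data(1024, "data"); // illustrative array

   // The loop body is enqueued on `stream` rather than the default stream,
   // so kernels launched on different resources are free to overlap.
   CARE_STREAMED_LOOP(stream, i, 0, 1024) {
      data[i] = 2.0f * i;
   } CARE_STREAMED_LOOP_END

   // Block until all enqueued work has finished before using the results.
   care::gpuDeviceSynchronize(__FILE__, __LINE__);
   data.free();
   return 0;
}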
diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 1de5ec51..6e055d25 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -261,6 +261,10 @@
 
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
 
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -548,6 +552,15 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
    CARE_NEST_END(CHECK) }}
 
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) { \
+   if (END_INDEX > START_INDEX) { \
+      CARE_NEST_BEGIN(CHECK) \
+      care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
+      CARE_SET_THREAD_ID(INDEX)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) }); \
+   CARE_NEST_END(CHECK) }}
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -753,6 +766,10 @@
 
 #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)
 
+#define CARE_STREAMED_LOOP(RESOURCE, INDEX, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, care_streamed_loop_check)
+
+#define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a RAJA loop that uses at least one
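To make the control flow concrete, hand-expanding CARE_STREAMED_LOOP(res, i, 0, n) { ... } CARE_STREAMED_LOOP_END in a GPU build gives roughly the following (a sketch, with the CARE_NEST_BEGIN/CARE_NEST_END and CARE_SET_THREAD_ID bookkeeping elided):

{
   if (n > 0) {
      // forall_with_stream (added to src/care/forall.h below) launches the
      // lambda on the stream owned by `res` instead of the default stream.
      care::forall_with_stream(care::gpu{}, res, __FILE__, __LINE__, 0, n,
                               [=] CARE_DEVICE (const int i) {
         // ... loop body ...
      });
   }
}

In CPU-only builds the macro simply forwards to CARE_CHECKED_OPENMP_FOR_LOOP_START, so the RESOURCE argument is accepted but ignored.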
diff --git a/src/care/forall.h b/src/care/forall.h
index d45433cc..20036cd4 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -30,22 +30,26 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
    static bool s_reverseLoopOrder = false;
 #endif
-   
+
    template <typename ExecutionPolicy>
    struct ExecutionPolicyToSpace {
      static constexpr const chai::ExecutionSpace value = chai::CPU;
    };
 
 #if defined(__CUDACC__)
+   typedef RAJA::resources::Cuda Resource;
    template <>
    struct ExecutionPolicyToSpace<RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, true>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
    };
 #elif defined (__HIPCC__)
+   typedef RAJA::resources::Hip Resource;
    template <>
    struct ExecutionPolicyToSpace<RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, true>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
    };
+#else
+   typedef RAJA::resources::Host Resource;
 #endif
 
 #if CARE_ENABLE_GPU_SIMULATION_MODE
@@ -97,6 +101,50 @@ namespace care {
       }
    }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Peter Robinson, Alan Dayton
+   ///
+   /// @brief Loops over the given indices and calls the loop body with each index.
+   ///        This overload is CHAI and RAJA aware and sets the execution space accordingly.
+   ///
+   /// @arg[in] policy Used to choose this overload of forall
+   /// @arg[in] res Resource to be used
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+   template <typename ExecutionPolicy, typename R, typename LB>
+   void forall(ExecutionPolicy /* policy */, R res, const char * fileName, const int lineNumber,
+               const int start, const int end, LB&& body) {
+      const int length = end - start;
+
+      if (length != 0) {
+         PluginData::setFileName(fileName);
+         PluginData::setLineNumber(lineNumber);
+
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+         RAJA::RangeStrideSegment rangeSegment =
+            s_reverseLoopOrder ?
+            RAJA::RangeStrideSegment(end - 1, start - 1, -1) :
+            RAJA::RangeStrideSegment(start, end, 1);
+#else
+         RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(start, end);
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+         RAJA::forall<RAJA::seq_exec>(res, rangeSegment, std::forward<LB>(body));
+#else
+         RAJA::forall<ExecutionPolicy>(res, rangeSegment, std::forward<LB>(body));
+#endif
+      }
+   }
+
+
    ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -189,6 +237,49 @@ namespace care {
 #endif
    }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Neela Kausik
+   ///
+   /// @brief If GPU is available, execute on the device. Otherwise, execute on
+   ///        the host. This specialization is needed for clang-query.
+   ///
+   /// @arg[in] gpu Used to choose this overload of forall
+   /// @arg[in] res Resource provided for execution
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CARE_GPUCC)
+   template <typename LB>
+   void forall_with_stream(gpu, Resource res, const char * fileName, const int lineNumber,
+                           const int start, const int end, LB&& body) {
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = true;
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+      forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#elif defined(__CUDACC__)
+      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, true>{},
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#elif defined(__HIPCC__)
+      forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, true>{},
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#else
+      forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#endif
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = false;
+#endif
+   }
+#endif
+
    ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
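Note that the benchmark synchronizes with care::gpuDeviceSynchronize, which blocks on the whole device; the events it collects via res_arr[i].get_event() are not yet consumed. For ordering between two streams without a full device sync, RAJA resources can wait on each other's events. A sketch, assuming a GPU build and RAJA's resource event API (get_event and wait_for); the function and variable names are illustrative:

#include "care/DefaultMacros.h"
#include "care/forall.h"
#include "care/host_device_ptr.h"
#include <cmath>

// Chain two kernels across streams without blocking the host.
void chain_streams(care::Resource& producer, care::Resource& consumer,
                   care::host_device_ptr<float>& data, int n) {
   CARE_STREAMED_LOOP(producer, i, 0, n) {
      data[i] = sqrtf(i);
   } CARE_STREAMED_LOOP_END

   // Record the producer stream's current position...
   RAJA::resources::Event e = producer.get_event();
   // ...and make the consumer stream wait on it.
   consumer.wait_for(&e);

   CARE_STREAMED_LOOP(consumer, i, 0, n) {
      data[i] *= 2.0f;
   } CARE_STREAMED_LOOP_END
}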