diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8091a6df8..0d9a6c6e1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -221,6 +221,12 @@ blt_add_executable( stream/TRIAD.cpp stream/TRIAD-Seq.cpp stream/TRIAD-OMPTarget.cpp + stream/TRIAD_PARTED.cpp + stream/TRIAD_PARTED-Seq.cpp + stream/TRIAD_PARTED-OMPTarget.cpp + stream/TRIAD_PARTED_FUSED.cpp + stream/TRIAD_PARTED_FUSED-Seq.cpp + stream/TRIAD_PARTED_FUSED-OMPTarget.cpp common/DataUtils.cpp common/Executor.cpp common/KernelBase.cpp diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp index a9d161183..e82795553 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ - deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_len_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), 
unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -99,7 +100,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) if ( vid == Base_CUDA ) { - HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA(Base_CUDA); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -198,7 +199,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) } stopTimer(); - HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(Base_CUDA); } else { getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; @@ -216,7 +217,8 @@ void HALO_EXCHANGE_FUSED::runCudaVariantWorkGroup(VariantID vid) if ( vid == RAJA_CUDA ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp index 2ac30479b..e297eede2 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); 
\ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -99,7 +100,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) if ( vid == Base_HIP ) { - HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP(Base_HIP); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -198,7 +199,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) } stopTimer(); - HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP(Base_HIP); } else { getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; @@ -216,7 +217,8 @@ void HALO_EXCHANGE_FUSED::runHipVariantWorkGroup(VariantID vid) if ( vid == RAJA_HIP ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp index 1af5d4bb9..08dae9d81 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -307,7 +309,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid) case RAJA_OpenMP : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp index 18c32437d..0bace7a6e 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/OpenMPTargetDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -197,7 +198,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) if ( vid == RAJA_OpenMPTarget ) { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp index bca51de0d..95452b833 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -231,7 +233,7 @@ void HALO_EXCHANGE_FUSED::runSeqVariantWorkGroup(VariantID vid) case RAJA_Seq : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + 
rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-Cuda.cpp b/src/comm/HALO_PACKING_FUSED-Cuda.cpp index 7541a30ef..de4cb8252 100644 --- a/src/comm/HALO_PACKING_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ - deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_len_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -103,7 +104,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) if ( vid == Base_CUDA ) { - HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA(Base_CUDA); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -189,7 +190,7 @@ 
void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) } stopTimer(); - HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(Base_CUDA); } else { getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; @@ -207,7 +208,8 @@ void HALO_PACKING_FUSED::runCudaVariantWorkGroup(VariantID vid) if ( vid == RAJA_CUDA ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp index 7b4d9b064..43c131144 100644 --- a/src/comm/HALO_PACKING_FUSED-Hip.cpp +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + 
deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -103,7 +104,7 @@ void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) if ( vid == Base_HIP ) { - HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP(Base_HIP); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -189,7 +190,7 @@ void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) } stopTimer(); - HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP(Base_HIP); } else { getCout() << "\n HALO_PACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; @@ -207,7 +208,8 @@ void HALO_PACKING_FUSED::runHipVariantWorkGroup(VariantID vid) if ( vid == RAJA_HIP ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-OMP.cpp b/src/comm/HALO_PACKING_FUSED-OMP.cpp index 143a65501..dd854cee5 100644 --- a/src/comm/HALO_PACKING_FUSED-OMP.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMP.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -275,7 +277,7 @@ void HALO_PACKING_FUSED::runOpenMPVariantWorkGroup(VariantID vid) case RAJA_OpenMP : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp index ab0b075b4..8ca1d4ef5 100644 --- a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/OpenMPTargetDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -182,7 +183,7 @@ void HALO_PACKING_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) if ( vid == RAJA_OpenMPTarget ) { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp index 2b25adcd0..ae3d6a6f1 100644 --- a/src/comm/HALO_PACKING_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -199,7 +201,7 @@ void HALO_PACKING_FUSED::runSeqVariantWorkGroup(VariantID vid) case RAJA_Seq : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 225d8233f..f1dbefd19 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -403,6 +403,26 @@ inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size } +/*! + * \brief Allocator type for basic_mempool using DataSpaces. 
+ */ +template < DataSpace dataSpace > +struct dataspace_allocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + return detail::allocData(dataSpace, nbytes, alignof(std::max_align_t)); + } + + // returns true on success, false on failure + bool free(void* ptr) + { + detail::deallocData(dataSpace, ptr); + return true; + } +}; + /*! * \brief Holds a RajaPool object and provides access to it via a * std allocator compliant type. @@ -497,6 +517,11 @@ struct RAJAPoolAllocatorHolder { } }; + template < typename ... Ts > + RAJAPoolAllocatorHolder(Ts&&... args) + : m_pool(std::forward(args)...) + { } + template < typename T > Allocator getAllocator() { diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 32f32f64b..df23ca225 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -264,6 +264,46 @@ class KernelBase DataSpace getReductionDataSpace(VariantID vid) const; DataSpace getMPIDataSpace(VariantID vid) const; + /*! + * \brief Get DataSpace to use with fusers for given variant. + */ + static constexpr DataSpace getFuserDataSpace(VariantID vid) + { + switch(vid) + { + case VariantID::Base_Seq: + case VariantID::Lambda_Seq: + case VariantID::RAJA_Seq: + return DataSpace::Host; + + case VariantID::Base_OpenMP: + case VariantID::Lambda_OpenMP: + case VariantID::RAJA_OpenMP: + return DataSpace::Host; + + case VariantID::Base_OpenMPTarget: + case VariantID::RAJA_OpenMPTarget: + return DataSpace::Host; + + case VariantID::Base_CUDA: + case VariantID::Lambda_CUDA: + case VariantID::RAJA_CUDA: + return DataSpace::CudaPinned; + + case VariantID::Base_HIP: + case VariantID::Lambda_HIP: + case VariantID::RAJA_HIP: + return DataSpace::HipPinnedCoarse; + + case VariantID::Kokkos_Lambda: + return DataSpace::Host; + + case VariantID::NumVariants: + return DataSpace::Host; + } + return DataSpace::Host; + } + template void allocData(DataSpace dataSpace, T& ptr, Size_type len) { diff --git a/src/common/MemPool.hpp b/src/common/MemPool.hpp new file mode 100644 index 000000000..03b2e755a --- /dev/null +++ b/src/common/MemPool.hpp @@ -0,0 +1,450 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// This file is taken from RAJA/util/basic_mempool.hpp +// and modified to support resources +// + +#ifndef RAJAPerf_BASIC_MEMPOOL_HPP +#define RAJAPerf_BASIC_MEMPOOL_HPP + +#include +#include +#include +#include +#include +#include + +#include "RAJA/util/align.hpp" +#include "RAJA/util/mutex.hpp" + +namespace rajaperf +{ + +namespace basic_mempool +{ + +namespace detail +{ + + +/*! 
\class MemoryArena + ****************************************************************************** + * + * \brief MemoryArena is a map based subclass for class MemPool + * provides book-keeping to divy a large chunk of pre-allocated memory to avoid + * the overhead of malloc/free or cudaMalloc/cudaFree, etc + * + * get/give are the primary calls used by class MemPool to get aligned memory + * from the pool or give it back + * + * + ****************************************************************************** + */ +class MemoryArena +{ +public: + using free_type = std::map; + using free_value_type = typename free_type::value_type; + using used_type = std::map; + using used_value_type = typename used_type::value_type; + + MemoryArena(void* ptr, size_t size) + : m_allocation{ ptr, static_cast(ptr)+size }, + m_free_space(), + m_used_space() + { + m_free_space[ptr] = static_cast(ptr)+size ; + if (m_allocation.begin == nullptr) { + fprintf(stderr, "Attempt to create MemoryArena with no memory"); + std::abort(); + } + } + + MemoryArena(MemoryArena const&) = delete; + MemoryArena& operator=(MemoryArena const&) = delete; + + MemoryArena(MemoryArena&&) = default; + MemoryArena& operator=(MemoryArena&&) = default; + + size_t capacity() + { + return static_cast(m_allocation.end) - + static_cast(m_allocation.begin); + } + + bool unused() { return m_used_space.empty(); } + + void* get_allocation() { return m_allocation.begin; } + + void* get(size_t nbytes, size_t alignment) + { + void* ptr_out = nullptr; + if (capacity() >= nbytes) { + free_type::iterator end = m_free_space.end(); + for (free_type::iterator iter = m_free_space.begin(); iter != end; + ++iter) { + + void* adj_ptr = iter->first; + size_t cap = + static_cast(iter->second) - static_cast(adj_ptr); + + if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) { + + ptr_out = adj_ptr; + + remove_free_chunk(iter, + adj_ptr, + static_cast(adj_ptr) + nbytes); + + add_used_chunk(adj_ptr, static_cast(adj_ptr) + nbytes); + + break; + } + } + } + return ptr_out; + } + + bool give(void* ptr) + { + if (m_allocation.begin <= ptr && ptr < m_allocation.end) { + + used_type::iterator found = m_used_space.find(ptr); + + if (found != m_used_space.end()) { + + add_free_chunk(found->first, found->second); + + m_used_space.erase(found); + + } else { + fprintf(stderr, "Invalid free %p", ptr); + std::abort(); + } + + return true; + } else { + return false; + } + } + +private: + struct memory_chunk { + void* begin; + void* end; + }; + + void add_free_chunk(void* begin, void* end) + { + // integrates a chunk of memory into free_space + free_type::iterator invl = m_free_space.end(); + free_type::iterator next = m_free_space.lower_bound(begin); + + // check if prev exists + if (next != m_free_space.begin()) { + // check if prev can cover [begin, end) + free_type::iterator prev = next; + --prev; + if (prev->second == begin) { + // extend prev to cover [begin, end) + prev->second = end; + + // check if prev can cover next too + if (next != invl) { + assert(next->first != begin); + + if (next->first == end) { + // extend prev to cover next too + prev->second = next->second; + + // remove redundant next + m_free_space.erase(next); + } + } + return; + } + } + + if (next != invl) { + assert(next->first != begin); + + if (next->first == end) { + // extend next to cover [begin, end) + m_free_space.insert(next, free_value_type{begin, next->second}); + m_free_space.erase(next); + + return; + } + } + + // no free space adjacent to this chunk, add seperate free chunk [begin, + // 
end) + m_free_space.insert(next, free_value_type{begin, end}); + } + + void remove_free_chunk(free_type::iterator iter, void* begin, void* end) + { + + void* ptr = iter->first; + void* ptr_end = iter->second; + + // fixup m_free_space, shrinking and adding chunks as needed + if (ptr != begin) { + + // shrink end of current free region to [ptr, begin) + iter->second = begin; + + if (end != ptr_end) { + + // insert free region [end, ptr_end) after current free region + free_type::iterator next = iter; + ++next; + m_free_space.insert(next, free_value_type{end, ptr_end}); + } + + } else if (end != ptr_end) { + + // shrink beginning of current free region to [end, ptr_end) + free_type::iterator next = iter; + ++next; + m_free_space.insert(next, free_value_type{end, ptr_end}); + m_free_space.erase(iter); + + } else { + + // can not reuse current region, erase + m_free_space.erase(iter); + } + } + + void add_used_chunk(void* begin, void* end) + { + // simply inserts a chunk of memory into used_space + m_used_space.insert(used_value_type{begin, end}); + } + + memory_chunk m_allocation; + free_type m_free_space; + used_type m_used_space; +}; + +} /* end namespace detail */ + + +template +using MemPool = ::RAJA::basic_mempool::MemPool; + + +/*! \class LaggedMemPool + ****************************************************************************** + * + * \brief LaggedMemPool pre-allocates a large chunk of memory and provides generic + * malloc/free for the user to allocate aligned data within the pool + * + * LaggedMemPool uses MemoryArena to do the heavy lifting of maintaining access to + * the used/free space. + * + * LaggedMemPool provides an example generic_allocator which can guide more + *specialized + * allocators. The following are some examples + * + * using device_mempool_type = basic_mempool::LaggedMemPool; + * using device_zeroed_mempool_type = + *basic_mempool::LaggedMemPool; + * using pinned_mempool_type = basic_mempool::LaggedMemPool; + * + * The user provides the specialized allocator, for example : + * struct DeviceAllocator { + * + * // returns a valid pointer on success, nullptr on failure + * void* malloc(size_t nbytes) + * { + * void* ptr; + * cudaErrchk(cudaMalloc(&ptr, nbytes)); + * return ptr; + * } + * + * // returns true on success, false on failure + * bool free(void* ptr) + * { + * cudaErrchk(cudaFree(ptr)); + * return true; + * } + * }; + * + * + ****************************************************************************** + */ +template +class LaggedMemPool +{ +public: + using allocator_type = allocator_t; + + static inline LaggedMemPool& getInstance() + { + static LaggedMemPool pool{}; + return pool; + } + + static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull; + + LaggedMemPool(size_t default_arena_size = default_default_arena_size, + lagged_res const& lag_res = lagged_res::get_default()) + : m_arenas(), m_default_arena_size(default_arena_size), + m_alloc(), m_lagged_frees(), m_lag_res(lag_res) + { + } + + ~LaggedMemPool() + { + // This is here for the case that LaggedMemPool is used as a static object. + // If allocator_t uses cudaFree then it will fail with + // cudaErrorCudartUnloading when called after main. 
+ } + + + void free_chunks() + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + while (!m_arenas.empty()) { + void* allocation_ptr = m_arenas.front().get_allocation(); + m_alloc.free(allocation_ptr); + m_arenas.pop_front(); + } + } + + size_t arena_size() + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + return m_default_arena_size; + } + + size_t arena_size(size_t new_size) + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + size_t prev_size = m_default_arena_size; + m_default_arena_size = new_size; + return prev_size; + } + + template + T* malloc(size_t nTs, size_t alignment = alignof(T)) + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + auto get_from_existing_arena = [&](size_t size, size_t alignment) { + void* ptr = nullptr; + for (detail::MemoryArena& arena : m_arenas) { + ptr = arena.get(size, alignment); + if (ptr != nullptr) { + break; + } + } + return ptr; + }; + + auto get_from_new_arena = [&](size_t size, size_t alignment) { + void* ptr = nullptr; + const size_t alloc_size = + std::max(size + alignment, m_default_arena_size); + void* arena_ptr = m_alloc.malloc(alloc_size); + if (arena_ptr != nullptr) { + m_arenas.emplace_front(arena_ptr, alloc_size); + ptr = m_arenas.front().get(size, alignment); + } + return ptr; + }; + + const size_t size = nTs * sizeof(T); + + void* ptr = get_from_existing_arena(size, alignment); + + if (ptr == nullptr) { + free_lagged_memory_impl(); + ptr = get_from_existing_arena(size, alignment); + } + + if (ptr == nullptr) { + ptr = get_from_new_arena(size, alignment); + } + + return static_cast(ptr); + } + + void free(const void* cptr) + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + m_lagged_frees.emplace_back(const_cast(cptr)); + } + + void free_lagged_memory() + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + free_lagged_memory_impl(); + } + +private: + using arena_container_type = std::list; + +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp::mutex m_mutex; +#endif + + arena_container_type m_arenas; + size_t m_default_arena_size; + allocator_t m_alloc; + std::vector m_lagged_frees; + lagged_res m_lag_res; + + + void free_lagged_memory_impl() + { + if (!m_lagged_frees.empty()) { + m_lag_res.wait(); + for (void* ptr : m_lagged_frees) { + for (detail::MemoryArena& arena : m_arenas) { + if (arena.give(ptr)) { + ptr = nullptr; + break; + } + } + if (ptr != nullptr) { + fprintf(stderr, "Unknown pointer %p", ptr); + } + } + m_lagged_frees.clear(); + } + } +}; + +} /* end namespace basic_mempool */ + +} /* end namespace rajaperf */ + + +#endif /* RAJAPerf_BASIC_MEMPOOL_HPP */ diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index dc9d0e20b..a6e3ca69b 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -75,6 +75,8 @@ #include "stream/MUL.hpp" #include "stream/ADD.hpp" #include "stream/TRIAD.hpp" +#include "stream/TRIAD_PARTED.hpp" +#include "stream/TRIAD_PARTED_FUSED.hpp" #include "stream/DOT.hpp" // @@ -226,6 +228,8 @@ static const std::string KernelNames [] = std::string("Stream_DOT"), std::string("Stream_MUL"), std::string("Stream_TRIAD"), + std::string("Stream_TRIAD_PARTED"), + std::string("Stream_TRIAD_PARTED_FUSED"), // // Apps kernels... 
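For orientation, a minimal usage sketch of the pool machinery introduced above (rajaperf::basic_mempool::MemPool backed by the new dataspace_allocator from common/DataUtils.hpp). It is illustrative only and not part of the patch; the function name example_fuser_pool_usage and the choice of DataSpace::Host are assumptions.

// Illustrative sketch (assumed names); exercises the types added in
// common/DataUtils.hpp (dataspace_allocator) and common/MemPool.hpp.
#include "common/DataUtils.hpp"
#include "common/MemPool.hpp"

namespace rajaperf {

inline void example_fuser_pool_usage()
{
  // MemPool aliases RAJA::basic_mempool::MemPool with a user-supplied
  // allocator; dataspace_allocator routes arena allocations through the
  // suite's allocData/deallocData for the chosen DataSpace.
  using host_pool =
      basic_mempool::MemPool<dataspace_allocator<DataSpace::Host>>;

  host_pool& pool = host_pool::getInstance();

  // Carve 256 doubles out of an arena (an arena is created on demand).
  double* buf = pool.malloc<double>(256);

  // ... fill and use buf ...

  // Return the chunk to the arena's free list.
  pool.free(buf);
}

} // end namespace rajaperf

The LaggedMemPool variant has the same malloc/free interface but defers frees: free() only records the pointer, and free_lagged_memory() (or a later malloc that cannot be satisfied from existing arenas) waits on the lagged resource before returning chunks to their arenas, so in-flight device work that still reads the memory can complete first.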
@@ -897,6 +901,14 @@ KernelBase* getKernelObject(KernelID kid, kernel = new stream::TRIAD(run_params); break; } + case Stream_TRIAD_PARTED : { + kernel = new stream::TRIAD_PARTED(run_params); + break; + } + case Stream_TRIAD_PARTED_FUSED : { + kernel = new stream::TRIAD_PARTED_FUSED(run_params); + break; + } // // Apps kernels... diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index a112a44d1..624f37aa6 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -136,6 +136,8 @@ enum KernelID { Stream_DOT, Stream_MUL, Stream_TRIAD, + Stream_TRIAD_PARTED, + Stream_TRIAD_PARTED_FUSED, // // Apps kernels... diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 061e143cf..fb95769f8 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -12,8 +12,13 @@ #include #include +#include #include +#include +#include +#include +#include #include #include @@ -38,6 +43,9 @@ RunParams::RunParams(int argc, char** argv) size(0.0), size_factor(0.0), data_alignment(RAJA::DATA_ALIGN), + num_parts(10), + part_type(PartType::Even), + part_size_order(PartSizeOrder::Random), gpu_stream(1), gpu_block_sizes(), mpi_size(1), @@ -91,6 +99,142 @@ RunParams::~RunParams() } + +/* + ******************************************************************************* + * + * Reorder partition boundaries based on params while preserving sizes. + * + ******************************************************************************* + */ +void RunParams::reorderPartitionSizes(std::vector& parts) const +{ + std::vector size_of_parts; + + size_of_parts.reserve(num_parts); + + for (size_t p = 1; p < parts.size(); ++p) { + size_of_parts.emplace_back(parts[p] - parts[p-1]); + } + + switch ( part_size_order ) { + case PartSizeOrder::Random: + { + std::mt19937 rng(parts.size()); // seed consistently + std::shuffle(size_of_parts.begin(), size_of_parts.end(), rng); + } break; + case PartSizeOrder::Ascending: + { + std::sort(size_of_parts.begin(), size_of_parts.end(), std::less{}); + } break; + case PartSizeOrder::Descending: + { + std::sort(size_of_parts.begin(), size_of_parts.end(), std::greater{}); + } break; + default: + { + getCout() << "RunParams::reorderPartitionSizes: unknown part_size_order" << std::endl; + } break; + } + + for (size_t p = 1; p < parts.size(); ++p) { + parts[p] = parts[p-1] + size_of_parts[p-1]; + } +} + +/* + ******************************************************************************* + * + * Get a partition boundaries based on params. + * + ******************************************************************************* + */ +std::vector RunParams::getPartition(Index_type len, Index_type num_parts) const +{ + std::vector parts; + + parts.reserve(num_parts+1); + + parts.emplace_back(0); + + switch ( (len > num_parts && num_parts > 1) + ? 
part_type : PartType::Even ) { + case PartType::Even: + { + for (Index_type p = 1; p < num_parts; ++p) { + + parts.emplace_back((len/num_parts)*p + + (len%num_parts)*p / num_parts); + } + } break; + + case PartType::Geometric: + { + auto geo_sum = [](double a, double r, double n) { + // sum of geometric series + // for i in [0, n), a*pow(r, i) + return a * (1.0 - std::pow(r, n)) / (1.0 - r); + }; + + auto geo_solve_for_r = [&](double sum, double a, double n) + { + double max_r = std::pow(sum/a, 1.0 / (n-1.0)); + double min_r = 1.0; + + double r = (max_r + min_r) / 2.0; + double diff = geo_sum(a, r, n) - sum; + + constexpr double tolerance = 1.0; + constexpr size_t max_iter = 1000; + + // use bisection to find r + for (size_t iter = 0; + iter < max_iter && (diff < 0.0 || diff > tolerance); + ++iter) { + + if (diff > 0.0) { + max_r = r; + } else { + min_r = r; + } + + r = (max_r + min_r) / 2.0; + diff = geo_sum(a, r, n) - sum; + } + + return r; + }; + + constexpr double a = 1.0; + double r = geo_solve_for_r(len, a, num_parts); + + for (Index_type p = 1; p < num_parts; ++p) { + + Index_type val = static_cast(std::floor(geo_sum(a, r, p))); + + if (val > 0 && val < len) { + parts.emplace_back(val); + } else { + getCout() << "RunParams::getPartition: Geometric failed to generate partition" << std::endl; + break; + } + } + + } break; + default: + { + getCout() << "RunParams::getPartition: unknown part_type" << std::endl; + } break; + } + + parts.emplace_back(len); + + reorderPartitionSizes(parts); + + return parts; +} + + /* ******************************************************************************* * @@ -119,6 +263,9 @@ void RunParams::print(std::ostream& str) const str << "\n size = " << size; str << "\n size_factor = " << size_factor; str << "\n data_alignment = " << data_alignment; + str << "\n num_parts = " << num_parts; + str << "\n part_type = " << PartTypeToStr(part_type); + str << "\n part_size_order = " << PartSizeOrderToStr(part_size_order); str << "\n gpu stream = " << ((gpu_stream == 0) ? 
"0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -433,6 +580,78 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--num_parts") ) { + + i++; + if ( i < argc ) { + long long num_parts_arg = ::atoll( argv[i] ); + if ( num_parts_arg < 1 ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << 1 + << std::endl; + input_state = BadInput; + } else { + num_parts = num_parts_arg; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--part_type") ) { + + bool got_someting = false; + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + for (int ipt = 0; ipt < static_cast(PartType::NumPartTypes); ++ipt) { + PartType pt = static_cast(ipt); + if (PartTypeToStr(pt) == opt) { + got_someting = true; + part_type = pt; + break; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give a valid partition type" + << std::endl; + input_state = BadInput; + } + } + } + + } else if ( opt == std::string("--part_size_order") ) { + + bool got_someting = false; + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + for (int ipso = 0; ipso < static_cast(PartSizeOrder::NumPartSizeOrders); ++ipso) { + PartSizeOrder pso = static_cast(ipso); + if (PartSizeOrderToStr(pso) == opt) { + got_someting = true; + part_size_order = pso; + break; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give a valid partition size order" + << std::endl; + input_state = BadInput; + } + } + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -1110,7 +1329,7 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t -et default library (exclude default and library tunings)\n\n"; str << "\t Options for selecting kernel data used in kernels....\n" - << "\t ======================================================\n\n";; + << "\t ======================================================\n\n"; str << "\t --data_alignment, -align [default is RAJA::DATA_ALIGN]\n" << "\t (minimum memory alignment for host allocations)\n" @@ -1118,6 +1337,24 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t -align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; + str << "\t --num_parts [default is 10]\n" + << "\t (number of parts for *_PARTED kernels)\n" + << "\t Must be at least 1.\n"; + str << "\t\t Example...\n" + << "\t\t --num_parts 100 (breaks *_PARTED kernels into 100 loops)\n\n"; + + str << "\t --part_type [default is Even]\n" + << "\t (distribution for parts in *_PARTED kernels).\n" + << "\t Valid partition types are 'Even' and 'Geometric'\n"; + str << "\t\t Example...\n" + << "\t\t --part_type Geometric (makes partitions with a fixed ratio of sizes)\n\n"; + + str << "\t --part_size_order [default is Random]\n" + << "\t (way to order partition sizes).\n" + << "\t Valid partition size orders are 'Random', 'Ascending', and 'Descending'\n"; + str << "\t\t Example...\n" + << "\t\t --part_size_order Ascending (sort partition sizes in ascending order)\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; diff --git 
a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 10ae761a0..dd3cb80ae 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -97,6 +97,67 @@ class RunParams { } } + /*! + * \brief Enumeration indicating how to separate partitions + */ + enum PartType { + Even, /*!< all parts use same size */ + Geometric, /*!< part sizes are multiples of a single factor */ + + NumPartTypes + }; + + /*! + * \brief Translate PartType enum value to string + */ + static std::string PartTypeToStr(PartType pt) + { + switch (pt) { + case PartType::Even: + return "Even"; + case PartType::Geometric: + return "Geometric"; + case PartType::NumPartTypes: + default: + return "Unknown"; + } + } + + /*! + * \brief Enumeration indicating how to separate partitions + */ + enum PartSizeOrder { + Random, /*!< part sizes are ordered randomly (but consistently) */ + Ascending, /*!< part sizes are in ascending order */ + Descending, /*!< part sizes are in descending order */ + + NumPartSizeOrders + }; + + /*! + * \brief Translate PartSizeOrder enum value to string + */ + static std::string PartSizeOrderToStr(PartSizeOrder pt) + { + switch (pt) { + case PartSizeOrder::Random: + return "Random"; + case PartSizeOrder::Ascending: + return "Ascending"; + case PartSizeOrder::Descending: + return "Descending"; + case PartSizeOrder::NumPartSizeOrders: + default: + return "Unknown"; + } + } + + /*! + * \brief Get a partition from a length, number of partitions, and PartType enum value + * Note that the vector will be of length (num_part+1) + */ + std::vector getPartition(Index_type len, Index_type num_parts) const; + /*! * \brief Return state of input parsed to this point. */ @@ -123,6 +184,8 @@ class RunParams { Size_type getDataAlignment() const { return data_alignment; } + Index_type getNumParts() const { return num_parts; } + int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -199,6 +262,11 @@ class RunParams { private: RunParams() = delete; + /*! + * \brief Reorder partition sizes, used in getPartition. + */ + void reorderPartitionSizes(std::vector& parts) const; + //@{ //! 
@name Routines used in command line parsing and printing option output void parseCommandLineOptions(int argc, char** argv); @@ -231,9 +299,13 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ - double size_factor; /*!< default kernel size multipier (input option) */ + double size_factor; /*!< default kernel size multiplier (input option) */ Size_type data_alignment; + Index_type num_parts; /*!< number of parts used in parted kernels (input option) */ + PartType part_type; /*!< how the partition sizes are generated (input option) */ + PartSizeOrder part_size_order; /*!< how the partition sizes are ordered (input option) */ + int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ int mpi_size; /*!< Number of MPI ranks */ diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index b730791fc..c1f8241ae 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -38,5 +38,17 @@ blt_add_library( TRIAD-Cuda.cpp TRIAD-OMPTarget.cpp TRIAD-OMP.cpp + TRIAD_PARTED.cpp + TRIAD_PARTED-Seq.cpp + TRIAD_PARTED-Hip.cpp + TRIAD_PARTED-Cuda.cpp + TRIAD_PARTED-OMPTarget.cpp + TRIAD_PARTED-OMP.cpp + TRIAD_PARTED_FUSED.cpp + TRIAD_PARTED_FUSED-Seq.cpp + TRIAD_PARTED_FUSED-Hip.cpp + TRIAD_PARTED_FUSED-Cuda.cpp + TRIAD_PARTED_FUSED-OMPTarget.cpp + TRIAD_PARTED_FUSED-OMP.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/stream/TRIAD_PARTED-Cuda.cpp b/src/stream/TRIAD_PARTED-Cuda.cpp new file mode 100644 index 000000000..c6d2513b8 --- /dev/null +++ b/src/stream/TRIAD_PARTED-Cuda.cpp @@ -0,0 +1,580 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, + Index_type ibegin, Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x + ibegin; + if (i < iend) { + TRIAD_PARTED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStream(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( 
cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStreamOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStreamEvent(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), cudaEvent_t{}); + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventCreateWithFlags( &events[p], cudaEventDisableTiming ) ); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, 
block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventDestroy( events[p] ) ); + } +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStreamEventOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), cudaEvent_t{}); + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventCreateWithFlags( &events[p], cudaEventDisableTiming ) ); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const 
Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventDestroy( events[p] ) ); + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void TRIAD_PARTED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStream(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStreamOpenmp(vid); + + } + + t += 1; +#endif + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStreamEvent(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStreamEventOpenmp(vid); + + } + + t += 1; +#endif + + } + + }); +} + +void TRIAD_PARTED::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "stream_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, "stream_omp_"+std::to_string(block_size)); +#endif + + addVariantTuningName(vid, "stream_event_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, "stream_event_omp_"+std::to_string(block_size)); +#endif + + } + + }); +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/stream/TRIAD_PARTED-Hip.cpp b/src/stream/TRIAD_PARTED-Hip.cpp new file mode 100644 index 000000000..e8e3ffa05 --- /dev/null +++ b/src/stream/TRIAD_PARTED-Hip.cpp @@ -0,0 +1,592 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, + Index_type ibegin, Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x + ibegin; + if (i < iend) { + TRIAD_PARTED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res.get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStream(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend 
= parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStreamOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + + + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStreamEvent(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), hipEvent_t{}); + for (size_t p = 0; p < parts.size(); 
++p ) { + hipErrchk( hipEventCreateWithFlags( &events[p], hipEventDisableTiming ) ); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + hipErrchk( hipEventDestroy( events[p] ) ); + } +} + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStreamEventOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), hipEvent_t{}); + for (size_t p = 0; p < parts.size(); ++p ) { + hipErrchk( hipEventCreateWithFlags( &events[p], hipEventDisableTiming ) ); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t 
grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + hipErrchk( hipEventDestroy( events[p] ) ); + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void TRIAD_PARTED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStream(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStreamOpenmp(vid); + + } + + t += 1; +#endif + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStreamEvent(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStreamEventOpenmp(vid); + + } + + t += 1; +#endif + + } + + }); +} + +void TRIAD_PARTED::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "stream_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, 
"stream_omp_"+std::to_string(block_size)); +#endif + + addVariantTuningName(vid, "stream_event_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, "stream_event_omp_"+std::to_string(block_size)); +#endif + + } + + }); +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/stream/TRIAD_PARTED-OMP.cpp b/src/stream/TRIAD_PARTED-OMP.cpp new file mode 100644 index 000000000..2366ca3e4 --- /dev/null +++ b/src/stream/TRIAD_PARTED-OMP.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + auto triad_parted_lam = [=](Index_type i) { + TRIAD_PARTED_BODY; + }; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + TRIAD_PARTED_BODY; + } + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + triad_parted_lam(i); + } + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), triad_parted_lam); + } + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n TRIAD_PARTED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED-OMPTarget.cpp b/src/stream/TRIAD_PARTED-OMPTarget.cpp new file mode 100644 index 000000000..0733bc8b8 --- /dev/null +++ b/src/stream/TRIAD_PARTED-OMPTarget.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +void TRIAD_PARTED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + #pragma omp target is_device_ptr(a, b, c) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + TRIAD_PARTED_BODY; + } + } + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/stream/TRIAD_PARTED-Seq.cpp b/src/stream/TRIAD_PARTED-Seq.cpp new file mode 100644 index 000000000..b261de348 --- /dev/null +++ b/src/stream/TRIAD_PARTED-Seq.cpp @@ -0,0 +1,105 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + +#if defined(RUN_RAJA_SEQ) + auto triad_parted_lam = [=](Index_type i) { + TRIAD_PARTED_BODY; + }; +#endif + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + for (Index_type i = ibegin; i < iend; ++i ) { + TRIAD_PARTED_BODY; + } + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + for (Index_type i = ibegin; i < iend; ++i ) { + triad_parted_lam(i); + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), triad_parted_lam); + } + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n TRIAD_PARTED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED.cpp b/src/stream/TRIAD_PARTED.cpp new file mode 100644 index 000000000..7ad85673e --- /dev/null +++ b/src/stream/TRIAD_PARTED.cpp @@ -0,0 +1,100 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +TRIAD_PARTED::TRIAD_PARTED(const RunParams& params) + : KernelBase(rajaperf::Stream_TRIAD_PARTED, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(1000); + + setActualProblemSize( getTargetProblemSize() ); + + const Index_type num_parts = std::min(params.getNumParts(), getActualProblemSize()); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1*num_parts); + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + getActualProblemSize() ); + setFLOPsPerRep(2 * getActualProblemSize()); + + checksum_scale_factor = 0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + m_parts.reserve(num_parts+1); + m_parts.emplace_back(0); + for (Index_type p = 1; p < num_parts; ++p) { + // use evenly spaced parts for now + m_parts.emplace_back((getActualProblemSize()/num_parts)*p + + (getActualProblemSize()%num_parts)*p / num_parts); + } + m_parts.emplace_back(getActualProblemSize()); + + setUsesFeature( Forall ); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); +} + +TRIAD_PARTED::~TRIAD_PARTED() +{ +} + +void TRIAD_PARTED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); + allocAndInitData(m_b, getActualProblemSize(), vid); + allocAndInitData(m_c, getActualProblemSize(), vid); + initData(m_alpha, vid); +} + +void TRIAD_PARTED::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor , vid); +} + +void TRIAD_PARTED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_a, vid); + deallocData(m_b, vid); + deallocData(m_c, vid); +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED.hpp b/src/stream/TRIAD_PARTED.hpp new file mode 100644 index 000000000..83625b733 --- /dev/null +++ b/src/stream/TRIAD_PARTED.hpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// TRIAD_PARTED kernel reference implementation: +/// +/// for (size_t p = 1; p < parts.size(); ++p ) { +/// Index_type ibegin = iparts[p-1]; +/// Index_type iend = iparts[p]; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// a[i] = b[i] + alpha * c[i] ; +/// } +/// } +/// + +#ifndef RAJAPerf_Stream_TRIAD_PARTED_HPP +#define RAJAPerf_Stream_TRIAD_PARTED_HPP + +#define TRIAD_PARTED_DATA_SETUP \ + std::vector parts = m_parts; \ + \ + Real_ptr a = m_a; \ + Real_ptr b = m_b; \ + Real_ptr c = m_c; \ + Real_type alpha = m_alpha; + +#define TRIAD_PARTED_BODY \ + a[i] = b[i] + alpha * c[i] ; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace stream +{ + +class TRIAD_PARTED : public KernelBase +{ +public: + + TRIAD_PARTED(const RunParams& params); + + ~TRIAD_PARTED(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runCudaVariantStream(VariantID vid); + template < size_t block_size > + void runHipVariantStream(VariantID vid); + template < size_t block_size > + void runCudaVariantStreamOpenmp(VariantID vid); + template < size_t block_size > + void runHipVariantStreamOpenmp(VariantID vid); + template < size_t block_size > + void runCudaVariantStreamEvent(VariantID vid); + template < size_t block_size > + void runHipVariantStreamEvent(VariantID vid); + template < size_t block_size > + void runCudaVariantStreamEventOpenmp(VariantID vid); + template < size_t block_size > + void runHipVariantStreamEventOpenmp(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + std::vector m_parts; + + Real_ptr m_a; + Real_ptr m_b; + Real_ptr m_c; + Real_type m_alpha; +}; + +} // end namespace stream +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/stream/TRIAD_PARTED_FUSED-Cuda.cpp b/src/stream/TRIAD_PARTED_FUSED-Cuda.cpp new file mode 100644 index 000000000..efcfbc1b5 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-Cuda.cpp @@ -0,0 +1,976 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_graph(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, + Index_type ibegin, Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x + ibegin; + if (i < iend) { + TRIAD_PARTED_FUSED_BODY; + } +} + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_CUDA(vid) \ + Index_type* len_ptrs; \ + Real_ptr* a_ptrs; \ + Real_ptr* b_ptrs; \ + Real_ptr* c_ptrs; \ + Real_type* alpha_ptrs; \ + Index_type* ibegin_ptrs; \ + allocData(getFuserDataSpace(vid), len_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), a_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), b_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), c_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), alpha_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), ibegin_ptrs, parts.size()-1); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), len_ptrs); \ + deallocData(getFuserDataSpace(vid), a_ptrs); \ + deallocData(getFuserDataSpace(vid), b_ptrs); \ + deallocData(getFuserDataSpace(vid), c_ptrs); \ + deallocData(getFuserDataSpace(vid), alpha_ptrs); \ + deallocData(getFuserDataSpace(vid), ibegin_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_soa(Index_type* len_ptrs, Real_ptr* a_ptrs, + Real_ptr* b_ptrs, Real_ptr* c_ptrs, + Real_type* alpha_ptrs, Index_type* ibegin_ptrs) +{ + Index_type j = blockIdx.y; + + Index_type len = len_ptrs[j]; + Real_ptr a = a_ptrs[j]; + Real_ptr b = b_ptrs[j]; + Real_ptr c = c_ptrs[j]; + Real_type alpha = alpha_ptrs[j]; + Index_type ibegin = ibegin_ptrs[j]; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(vid, num_holders) \ + triad_holder* triad_holders; \ + allocData(getFuserDataSpace(vid), triad_holders, (num_holders)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), triad_holders); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_aos(triad_holder* triad_holders) +{ + Index_type j = blockIdx.y; + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + +using scan_index_type = RAJA::cuda_dim_member_t; +#define WARP_SIZE 32 +#define warp_shfl(...) 
__shfl_sync(0xffffffff, __VA_ARGS__) + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_scan_aos(scan_index_type* first_blocks, scan_index_type num_fused, + triad_holder* triad_holders) +{ + scan_index_type min_j = 0; + scan_index_type max_j = num_fused-1; + scan_index_type j = (min_j + max_j + 1) / 2; + scan_index_type first_block = first_blocks[j]; + while (min_j != max_j) { + if (first_block > blockIdx.x) { + max_j = j-1; + } else { + min_j = j; + } + j = (min_j + max_j + 1) / 2; + first_block = first_blocks[j]; + } + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + Index_type ii = threadIdx.x + (blockIdx.x - first_block) * block_size; + if (ii < len) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantGraphReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + + cudaGraph_t graph; + cudaErrchk(cudaGraphCreate(&graph, 0)); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + void* kernelArgs[] {(void*)&a, (void*)&b, (void*)&c, (void*)&alpha, (void*)&ibegin, (void*)&iend}; + + cudaKernelNodeParams params{}; + params.func = (void *)triad_parted_graph; + params.gridDim = dim3(grid_size); + params.blockDim = dim3(block_size); + params.sharedMemBytes = shmem; + params.kernelParams = kernelArgs; + + cudaGraphNode_t node; + cudaErrchk(cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms)); + } + + + cudaGraphExec_t graphexec; + { + constexpr size_t errbufsize = 256; + cudaGraphNode_t errnode = nullptr; + char errbuf[errbufsize]; + cudaErrchk(cudaGraphInstantiate(&graphexec, graph, &errnode, errbuf, errbufsize)); + } + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaGraphLaunch(graphexec, 0) ); + + } + stopTimer(); + + cudaErrchk(cudaGraphExecDestroy(graphexec)); + cudaErrchk(cudaGraphDestroy(graph)); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantSOA2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_CUDA(Base_CUDA) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, 
b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + cudaErrchk( cudaGetLastError() ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_CUDA(Base_CUDA) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantSOA2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_CUDA(Base_CUDA) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_CUDA(Base_CUDA) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders); + cudaErrchk( cudaGetLastError() ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< 
workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dPoolSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + const size_t pool_size = 32ull * 1024ull * 1024ull; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = std::max(parts.size()-1, pool_size / sizeof(triad_holder)); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + + Index_type holder_start = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + if (holder_start+parts.size()-1 > num_holders) { + // synchronize when have to reuse memory + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + holder_start = 0; + } + + Index_type num_fused = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[holder_start+num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + num_fused += 1; + } + + Index_type len_ave = (len_sum + num_fused-1) / num_fused; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, num_fused); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders+holder_start); + cudaErrchk( cudaGetLastError() ); + holder_start += num_fused; + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::LaggedMemPool< + dataspace_allocator, camp::resources::Cuda>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder(pool_size, res); + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin 
= parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_aos<<>>( + triad_holders); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dReuseFunctionPointer(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using 
workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dReuseVirtualFunction(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOSScanReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + scan_index_type* first_blocks; + allocData(getFuserDataSpace(Base_CUDA), first_blocks, (num_holders)); + + Index_type num_fused = 0; + scan_index_type num_blocks = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + 
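// first_blocks[j] holds the first x-block index assigned to part j; the
// triad_parted_fused_scan_aos kernel binary searches this array against
// blockIdx.x to recover which part a block should work on.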
first_blocks[num_fused] = num_blocks; + num_blocks += (static_cast(iend-ibegin) + + static_cast(block_size)-1) / + static_cast(block_size); + num_fused += 1; + } + dim3 nthreads_per_block(block_size); + dim3 nblocks(num_blocks); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_scan_aos<<>>( + first_blocks, num_fused, triad_holders); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + deallocData(getFuserDataSpace(Base_CUDA), first_blocks); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void TRIAD_PARTED_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantGraphReuse(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantSOA2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantSOA2dReuse(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOSScanReuse(vid); + + } + + t += 1; + } + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dPoolSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dReuse(vid); + + } + + t += 1; + + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dReuseFunctionPointer(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dReuseVirtualFunction(vid); + + } + + t += 1; + } + + } + + }); + + } else { + + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void TRIAD_PARTED_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_CUDA ) { + addVariantTuningName(vid, "graph_reuse_"+std::to_string(block_size)); + + addVariantTuningName(vid, "SOA_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "SOA_2d_reuse_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_scan_reuse_"+std::to_string(block_size)); + } + + addVariantTuningName(vid, "AOS_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_poolsync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_reuse_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { + + addVariantTuningName(vid, "AOS_2d_reuse_funcptr_"+std::to_string(block_size)); + addVariantTuningName(vid, "AOS_2d_reuse_virtfunc_"+std::to_string(block_size)); + + } + + } + + }); + + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/stream/TRIAD_PARTED_FUSED-Hip.cpp b/src/stream/TRIAD_PARTED_FUSED-Hip.cpp new file mode 100644 index 000000000..288a4c2d9 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-Hip.cpp @@ -0,0 +1,894 @@ 
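// Illustrative sketch (editorial, not part of this patch) of the "2d" fused launch used
// by the CUDA variants above and mirrored in the HIP file below: nblocks.x is sized from
// the average part length (len_ave), blockIdx.y selects which part a block works on, and
// the grid-stride loop picks up the tail of parts longer than average. Assuming the
// triad_holder layout shown in this patch, the device-side mapping is roughly:
//
//   Index_type j   = blockIdx.y;             // which fused part
//   Index_type len = triad_holders[j].len;   // length of that part
//   for (Index_type ii = threadIdx.x + blockIdx.x * block_size;
//        ii < len;
//        ii += block_size * gridDim.x) {     // grid-stride covers longer-than-average parts
//     Index_type i = ii + triad_holders[j].ibegin;
//     a[i] = b[i] + alpha * c[i];            // TRIAD_PARTED_FUSED_BODY
//   }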
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_HIP(vid) \ + Index_type* len_ptrs; \ + Real_ptr* a_ptrs; \ + Real_ptr* b_ptrs; \ + Real_ptr* c_ptrs; \ + Real_type* alpha_ptrs; \ + Index_type* ibegin_ptrs; \ + allocData(getFuserDataSpace(vid), len_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), a_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), b_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), c_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), alpha_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), ibegin_ptrs, parts.size()-1); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), len_ptrs); \ + deallocData(getFuserDataSpace(vid), a_ptrs); \ + deallocData(getFuserDataSpace(vid), b_ptrs); \ + deallocData(getFuserDataSpace(vid), c_ptrs); \ + deallocData(getFuserDataSpace(vid), alpha_ptrs); \ + deallocData(getFuserDataSpace(vid), ibegin_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_soa(Index_type* len_ptrs, Real_ptr* a_ptrs, + Real_ptr* b_ptrs, Real_ptr* c_ptrs, + Real_type* alpha_ptrs, Index_type* ibegin_ptrs) +{ + Index_type j = blockIdx.y; + + Index_type len = len_ptrs[j]; + Real_ptr a = a_ptrs[j]; + Real_ptr b = b_ptrs[j]; + Real_ptr c = c_ptrs[j]; + Real_type alpha = alpha_ptrs[j]; + Index_type ibegin = ibegin_ptrs[j]; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(vid, num_holders) \ + triad_holder* triad_holders; \ + allocData(getFuserDataSpace(vid), triad_holders, (num_holders)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), triad_holders); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_aos(triad_holder* triad_holders) +{ + Index_type j = blockIdx.y; + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + +using scan_index_type = RAJA::hip_dim_member_t; +#define WARP_SIZE 64 +#define warp_shfl(...) 
__shfl(__VA_ARGS__) + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_scan_aos(scan_index_type* first_blocks, scan_index_type num_fused, + triad_holder* triad_holders) +{ + scan_index_type min_j = 0; + scan_index_type max_j = num_fused-1; + scan_index_type j = (min_j + max_j + 1) / 2; + scan_index_type first_block = first_blocks[j]; + while (min_j != max_j) { + if (first_block > blockIdx.x) { + max_j = j-1; + } else { + min_j = j; + } + j = (min_j + max_j + 1) / 2; + first_block = first_blocks[j]; + } + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + Index_type ii = threadIdx.x + (blockIdx.x - first_block) * block_size; + if (ii < len) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantSOA2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_HIP(Base_HIP) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + hipErrchk( hipGetLastError() ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_HIP(Base_HIP) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantSOA2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_HIP(Base_HIP) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_HIP(Base_HIP) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t 
block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders); + hipErrchk( hipGetLastError() ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dPoolSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + const size_t pool_size = 32ull * 1024ull * 1024ull; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = std::max(parts.size()-1, pool_size / sizeof(triad_holder)); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + + Index_type holder_start = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + if (holder_start+parts.size()-1 > num_holders) { + // synchronize when have to reuse memory + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + holder_start = 0; + } + + Index_type num_fused = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const 
Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[holder_start+num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + num_fused += 1; + } + + Index_type len_ave = (len_sum + num_fused-1) / num_fused; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, num_fused); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders+holder_start); + hipErrchk( hipGetLastError() ); + holder_start += num_fused; + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::LaggedMemPool< + dataspace_allocator, camp::resources::Hip>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder(pool_size, res); + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_aos<<>>( + triad_holders); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + 
using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dReuseFunctionPointer(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dReuseVirtualFunction(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + 
AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOSScanReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + scan_index_type* first_blocks; + allocData(getFuserDataSpace(Base_HIP), first_blocks, (num_holders)); + + Index_type num_fused = 0; + scan_index_type num_blocks = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + first_blocks[num_fused] = num_blocks; + num_blocks += (static_cast(iend-ibegin) + + static_cast(block_size)-1) / + static_cast(block_size); + num_fused += 1; + } + dim3 nthreads_per_block(block_size); + dim3 nblocks(num_blocks); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_scan_aos<<>>( + first_blocks, num_fused, triad_holders); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + deallocData(getFuserDataSpace(Base_HIP), first_blocks); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +void TRIAD_PARTED_FUSED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantSOA2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantSOA2dReuse(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOSScanReuse(vid); + + } + + t += 1; + } + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dPoolSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + 
runHipVariantAOS2dReuse(vid); + + } + + t += 1; + + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dReuseFunctionPointer(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dReuseVirtualFunction(vid); + + } + + t += 1; + } + + } + + }); + + } else { + + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void TRIAD_PARTED_FUSED::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_HIP ) { + addVariantTuningName(vid, "SOA_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "SOA_2d_reuse_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_scan_reuse_"+std::to_string(block_size)); + } + + addVariantTuningName(vid, "AOS_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_poolsync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_reuse_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { + + addVariantTuningName(vid, "AOS_2d_reuse_funcptr_"+std::to_string(block_size)); + addVariantTuningName(vid, "AOS_2d_reuse_virtfunc_"+std::to_string(block_size)); + + } + } + + }); + + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/stream/TRIAD_PARTED_FUSED-OMP.cpp b/src/stream/TRIAD_PARTED_FUSED-OMP.cpp new file mode 100644 index 000000000..73a818b6a --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-OMP.cpp @@ -0,0 +1,210 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + index += 1; + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < index; j++) { + #pragma omp task firstprivate(j) + { + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + for (Index_type ii = 0; ii < len; ++ii ) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < index; j++) { + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + for (Index_type ii = 0; ii < len; ++ii ) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } +#endif + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + + case Lambda_OpenMP : { + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + new(&lambdas[index]) lambda_type(make_lambda(a, b, c, alpha, ibegin)); + lens[index] = iend-ibegin; + index += 1; + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < index; j++) { + #pragma omp task firstprivate(j) + { + auto lambda = lambdas[j]; + Index_type len = lens[j]; + for (Index_type ii = 0; ii < len; ii++) { + lambda(ii); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < index; j++) { + auto lambda = lambdas[j]; + Index_type len = lens[j]; + for (Index_type ii = 0; ii < len; ii++) { + lambda(ii); + } + } +#endif + + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } + + case RAJA_OpenMP : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + 
using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool (allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_fused_lam = [=](Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED_FUSED-OMPTarget.cpp b/src/stream/TRIAD_PARTED_FUSED-OMPTarget.cpp new file mode 100644 index 000000000..2eeee4f24 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-OMPTarget.cpp @@ -0,0 +1,154 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP \ + triad_holder* triad_holders; \ + allocData(DataSpace::OmpTarget, triad_holders, (parts.size()-1)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_COPY_OMP_TARGET \ + initOpenMPDeviceData(omp_triad_holders, triad_holders, index*sizeof(triad_holder)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ + deallocData(DataSpace::OmpTarget, triad_holders); \ + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN + +void TRIAD_PARTED_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + + TRIAD_PARTED_FUSED_MANUAL_FUSER_COPY_OMP_TARGET; + Index_type len_ave = (len_sum + index-1) / index; + #pragma omp target is_device_ptr(a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs, len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < index; j++) { + for (Index_type iii = 0; iii < len_ave; iii++) { + + Index_type len = omp_triad_holders[j].len; + Real_ptr a = omp_triad_holders[j].a; + Real_ptr b = omp_triad_holders[j].b; + Real_ptr c = 
omp_triad_holders[j].c; + Real_type alpha = omp_triad_holders[j].alpha; + Index_type ibegin = omp_triad_holders[j].ibegin; + + for (Index_type ii = iii; ii < len; ii += len_ave) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } + } + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_target_work /**/, + RAJA::ordered, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_fused_lam = [=](Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/stream/TRIAD_PARTED_FUSED-Seq.cpp b/src/stream/TRIAD_PARTED_FUSED-Seq.cpp new file mode 100644 index 000000000..2def10ff0 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-Seq.cpp @@ -0,0 +1,169 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + index += 1; + } + + for (Index_type j = 0; j < index; j++) { + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + for (Index_type ii = 0; ii < len; ++ii ) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + new(&lambdas[index]) lambda_type(make_lambda(a, b, c, alpha, ibegin)); + lens[index] = iend-ibegin; + index += 1; + } + + for (Index_type j = 0; j < index; j++) { + auto lambda = lambdas[j]; + Index_type len = lens[j]; + for (Index_type ii = 0; ii < len; ii++) { + lambda(ii); + } + } + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } + + case RAJA_Seq : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_fused_lam = [=](Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED_FUSED.cpp 
b/src/stream/TRIAD_PARTED_FUSED.cpp new file mode 100644 index 000000000..5c0011305 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED.cpp @@ -0,0 +1,91 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +TRIAD_PARTED_FUSED::TRIAD_PARTED_FUSED(const RunParams& params) + : KernelBase(rajaperf::Stream_TRIAD_PARTED_FUSED, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(1000); + + setActualProblemSize( getTargetProblemSize() ); + + const Index_type num_parts = std::min(params.getNumParts(), getActualProblemSize()); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + getActualProblemSize() ); + setFLOPsPerRep(2 * getActualProblemSize()); + + checksum_scale_factor = 0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + m_parts = params.getPartition(getActualProblemSize(), num_parts); + + setUsesFeature( Workgroup ); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +TRIAD_PARTED_FUSED::~TRIAD_PARTED_FUSED() +{ +} + +void TRIAD_PARTED_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); + allocAndInitData(m_b, getActualProblemSize(), vid); + allocAndInitData(m_c, getActualProblemSize(), vid); + initData(m_alpha, vid); +} + +void TRIAD_PARTED_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor , vid); +} + +void TRIAD_PARTED_FUSED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_a, vid); + deallocData(m_b, vid); + deallocData(m_c, vid); +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED_FUSED.hpp b/src/stream/TRIAD_PARTED_FUSED.hpp new file mode 100644 index 000000000..26ef36aa3 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED.hpp @@ -0,0 +1,153 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// TRIAD_PARTED_FUSED kernel reference implementation: +/// +/// for (size_t p = 1; p < parts.size(); ++p ) { +/// Index_type ibegin = iparts[p-1]; +/// Index_type iend = iparts[p]; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// a[i] = b[i] + alpha * c[i] ; +/// } +/// } +/// + +#ifndef RAJAPerf_Stream_TRIAD_PARTED_FUSED_HPP +#define RAJAPerf_Stream_TRIAD_PARTED_FUSED_HPP + +#define TRIAD_PARTED_FUSED_DATA_SETUP \ + std::vector parts = m_parts; \ + \ + Real_ptr a = m_a; \ + Real_ptr b = m_b; \ + Real_ptr c = m_c; \ + Real_type alpha = m_alpha; + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP \ + triad_holder* triad_holders = new triad_holder[parts.size()-1]; + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN \ + delete[] triad_holders; + + +#define TRIAD_PARTED_FUSED_BODY \ + a[i] = b[i] + alpha * c[i] ; + + +#define TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ + auto make_lambda = [](Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, Index_type ibegin) { \ + return [=](Index_type ii) { \ + Index_type i = ii + ibegin; \ + TRIAD_PARTED_FUSED_BODY; \ + }; \ + }; \ + using lambda_type = decltype(make_lambda(Real_ptr(), Real_ptr(), Real_ptr(), Real_type(), Index_type())); \ + lambda_type* lambdas = reinterpret_cast( \ + malloc(sizeof(lambda_type) * (parts.size()-1))); \ + Index_type* lens = new Index_type[parts.size()-1]; + +#define TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ + free(lambdas); \ + delete[] lens; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace stream +{ + +struct alignas(2*alignof(void*)) triad_holder { + Index_type len; + Real_ptr a; + Real_ptr b; + Real_ptr c; + Real_type alpha; + Index_type ibegin; +}; + +class TRIAD_PARTED_FUSED : public KernelBase +{ +public: + + TRIAD_PARTED_FUSED(const RunParams& params); + + ~TRIAD_PARTED_FUSED(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantGraphReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantSOA2dSync(VariantID vid); + template < size_t block_size > + void runHipVariantSOA2dSync(VariantID vid); + template < size_t block_size > + void runCudaVariantSOA2dReuse(VariantID vid); + template < size_t block_size > + void runHipVariantSOA2dReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dSync(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dSync(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dPoolSync(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dPoolSync(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dReuse(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dReuseFunctionPointer(VariantID 
vid); + template < size_t block_size > + void runHipVariantAOS2dReuseFunctionPointer(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dReuseVirtualFunction(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dReuseVirtualFunction(VariantID vid); + template < size_t block_size > + void runCudaVariantAOSScanReuse(VariantID vid); + template < size_t block_size > + void runHipVariantAOSScanReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + + std::vector<Index_type> m_parts; + + Real_ptr m_a; + Real_ptr m_b; + Real_ptr m_c; + Real_type m_alpha; +}; + +} // end namespace stream +} // end namespace rajaperf + +#endif // closing endif for header file include guard
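
Note (not part of the diff): the triad_parted_fused_scan_aos kernels above map each flat block index back to its partition by binary-searching first_blocks, where first_blocks[j] holds the first block assigned to partition j. A minimal host-side C++ sketch of that lookup, with illustrative names and test data:

#include <cassert>
#include <cstddef>
#include <vector>

// Mirrors the search loop in triad_parted_fused_scan_aos: first_blocks is
// non-decreasing, and partition j owns blocks [first_blocks[j], first_blocks[j+1]).
std::size_t find_partition(const std::vector<std::size_t>& first_blocks,
                           std::size_t block_id)
{
  std::size_t min_j = 0;
  std::size_t max_j = first_blocks.size() - 1;
  std::size_t j = (min_j + max_j + 1) / 2;  // round up so the search always makes progress
  while (min_j != max_j) {
    if (first_blocks[j] > block_id) {
      max_j = j - 1;   // block_id lies before partition j
    } else {
      min_j = j;       // partition j starts at or before block_id
    }
    j = (min_j + max_j + 1) / 2;
  }
  return j;
}

int main()
{
  // Three partitions needing 4, 1, and 3 blocks, respectively.
  const std::vector<std::size_t> first_blocks = {0, 4, 5};
  assert(find_partition(first_blocks, 0) == 0);
  assert(find_partition(first_blocks, 3) == 0);
  assert(find_partition(first_blocks, 4) == 1);
  assert(find_partition(first_blocks, 7) == 2);
  return 0;
}

Each block then handles at most block_size contiguous elements of its partition, so the scan variants need no grid-stride loop.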
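
Note (not part of the diff): the *_2d_sync and *_2d_reuse variants size a single 2D launch from the rounded-up average partition length; blockIdx.y selects the partition and the x dimension covers len_ave elements, with the grid-stride loop in x absorbing partitions longer than average. A small sketch of that sizing arithmetic, using assumed partition lengths:

#include <algorithm>
#include <cstdio>
#include <vector>

using Index_type = long;

int main()
{
  const Index_type block_size = 256;
  const std::vector<Index_type> lens = {1000, 10, 50000};  // assumed, uneven

  Index_type len_sum = 0;
  for (Index_type len : lens) { len_sum += len; }
  const Index_type index   = static_cast<Index_type>(lens.size());
  const Index_type len_ave = (len_sum + index - 1) / index;  // ceiling average

  const Index_type blocks_x = (len_ave + block_size - 1) / block_size;
  const Index_type blocks_y = index;                         // one grid row per partition
  std::printf("grid = (%ld, %ld), block = %ld\n", blocks_x, blocks_y, block_size);

  // Partitions longer than len_ave are still fully covered because the kernel
  // strides by block_size * gridDim.x in x.
  for (Index_type j = 0; j < blocks_y; ++j) {
    Index_type covered = 0;
    for (Index_type ii = 0; ii < lens[j]; ii += block_size * blocks_x) {
      covered += std::min(block_size * blocks_x, lens[j] - ii);
    }
    std::printf("partition %ld: len = %ld, covered = %ld\n", j, lens[j], covered);
  }
  return 0;
}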
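
Note (not part of the diff): the AOS_2d_poolsync variants write each repetition's triad_holder records into one large pinned allocation and only synchronize with the device when the next repetition would overwrite holders that may still be in flight. A hedged sketch of that reuse policy; the sizes and the wait_for_device() stand-in (hipStreamSynchronize/cudaStreamSynchronize in the real variants) are assumptions:

#include <cstddef>
#include <cstdio>

static void wait_for_device() { std::printf("  sync before reusing the pool\n"); }

int main()
{
  const std::size_t holders_per_rep = 100;  // parts.size()-1 per repetition (assumed)
  const std::size_t num_holders     = 350;  // pool capacity in holders (assumed)
  const int run_reps = 8;

  std::size_t holder_start = 0;
  for (int irep = 0; irep < run_reps; ++irep) {
    // Synchronize only when wrapping around would clobber pending holders.
    if (holder_start + holders_per_rep > num_holders) {
      wait_for_device();
      holder_start = 0;
    }
    std::printf("rep %d fills holders [%zu, %zu) and launches asynchronously\n",
                irep, holder_start, holder_start + holders_per_rep);
    holder_start += holders_per_rep;
  }
  return 0;
}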