diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8091a6df8..0d9a6c6e1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -221,6 +221,12 @@ blt_add_executable( stream/TRIAD.cpp stream/TRIAD-Seq.cpp stream/TRIAD-OMPTarget.cpp + stream/TRIAD_PARTED.cpp + stream/TRIAD_PARTED-Seq.cpp + stream/TRIAD_PARTED-OMPTarget.cpp + stream/TRIAD_PARTED_FUSED.cpp + stream/TRIAD_PARTED_FUSED-Seq.cpp + stream/TRIAD_PARTED_FUSED-OMPTarget.cpp common/DataUtils.cpp common/Executor.cpp common/KernelBase.cpp diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp index a9d161183..e82795553 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ - deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_len_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), 
unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -99,7 +100,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) if ( vid == Base_CUDA ) { - HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA(Base_CUDA); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -198,7 +199,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) } stopTimer(); - HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(Base_CUDA); } else { getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; @@ -216,7 +217,8 @@ void HALO_EXCHANGE_FUSED::runCudaVariantWorkGroup(VariantID vid) if ( vid == RAJA_CUDA ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp index 2ac30479b..e297eede2 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); 
\ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -99,7 +100,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) if ( vid == Base_HIP ) { - HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP(Base_HIP); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -198,7 +199,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) } stopTimer(); - HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP(Base_HIP); } else { getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; @@ -216,7 +217,8 @@ void HALO_EXCHANGE_FUSED::runHipVariantWorkGroup(VariantID vid) if ( vid == RAJA_HIP ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp index 1af5d4bb9..08dae9d81 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -307,7 +309,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid) case RAJA_OpenMP : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp index 18c32437d..0bace7a6e 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/OpenMPTargetDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -197,7 +198,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) if ( vid == RAJA_OpenMPTarget ) { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp index bca51de0d..95452b833 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -231,7 +233,7 @@ void HALO_EXCHANGE_FUSED::runSeqVariantWorkGroup(VariantID vid) case RAJA_Seq : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + 
rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-Cuda.cpp b/src/comm/HALO_PACKING_FUSED-Cuda.cpp index 7541a30ef..de4cb8252 100644 --- a/src/comm/HALO_PACKING_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ - deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, pack_len_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \ - deallocData(DataSpace::CudaPinned, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -103,7 +104,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) if ( vid == Base_CUDA ) { - HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA(Base_CUDA); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -189,7 +190,7 @@ 
void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) } stopTimer(); - HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(Base_CUDA); } else { getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; @@ -207,7 +208,8 @@ void HALO_PACKING_FUSED::runCudaVariantWorkGroup(VariantID vid) if ( vid == RAJA_CUDA ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp index 7b4d9b064..43c131144 100644 --- a/src/comm/HALO_PACKING_FUSED-Hip.cpp +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -21,33 +22,33 @@ namespace rajaperf namespace comm { -#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP(vid) \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), pack_len_ptrs); \ + 
deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \ + deallocData(getFuserDataSpace(vid), unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) @@ -103,7 +104,7 @@ void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) if ( vid == Base_HIP ) { - HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP(Base_HIP); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -189,7 +190,7 @@ void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) } stopTimer(); - HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP(Base_HIP); } else { getCout() << "\n HALO_PACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; @@ -207,7 +208,8 @@ void HALO_PACKING_FUSED::runHipVariantWorkGroup(VariantID vid) if ( vid == RAJA_HIP ) { - using AllocatorHolder = RAJAPoolAllocatorHolder; + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-OMP.cpp b/src/comm/HALO_PACKING_FUSED-OMP.cpp index 143a65501..dd854cee5 100644 --- a/src/comm/HALO_PACKING_FUSED-OMP.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMP.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -275,7 +277,7 @@ void HALO_PACKING_FUSED::runOpenMPVariantWorkGroup(VariantID vid) case RAJA_OpenMP : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp index ab0b075b4..8ca1d4ef5 100644 --- a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp @@ -13,6 +13,7 @@ #if defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/OpenMPTargetDataUtils.hpp" +#include "common/MemPool.hpp" #include @@ -182,7 +183,7 @@ void HALO_PACKING_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) if ( vid == RAJA_OpenMPTarget ) { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/comm/HALO_PACKING_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp index 2b25adcd0..ae3d6a6f1 100644 --- a/src/comm/HALO_PACKING_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "common/MemPool.hpp" + #include namespace rajaperf @@ -199,7 +201,7 @@ void HALO_PACKING_FUSED::runSeqVariantWorkGroup(VariantID vid) case RAJA_Seq : { using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; + rajaperf::basic_mempool::MemPool>>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 225d8233f..f1dbefd19 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -403,6 +403,26 @@ inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size } +/*! + * \brief Allocator type for basic_mempool using DataSpaces. 
+ */ +template < DataSpace dataSpace > +struct dataspace_allocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + return detail::allocData(dataSpace, nbytes, alignof(std::max_align_t)); + } + + // returns true on success, false on failure + bool free(void* ptr) + { + detail::deallocData(dataSpace, ptr); + return true; + } +}; + /*! * \brief Holds a RajaPool object and provides access to it via a * std allocator compliant type. @@ -497,6 +517,11 @@ struct RAJAPoolAllocatorHolder { } }; + template < typename ... Ts > + RAJAPoolAllocatorHolder(Ts&&... args) + : m_pool(std::forward(args)...) + { } + template < typename T > Allocator getAllocator() { diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 32f32f64b..df23ca225 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -264,6 +264,46 @@ class KernelBase DataSpace getReductionDataSpace(VariantID vid) const; DataSpace getMPIDataSpace(VariantID vid) const; + /*! + * \brief Get DataSpace to use with fusers for given variant. + */ + static constexpr DataSpace getFuserDataSpace(VariantID vid) + { + switch(vid) + { + case VariantID::Base_Seq: + case VariantID::Lambda_Seq: + case VariantID::RAJA_Seq: + return DataSpace::Host; + + case VariantID::Base_OpenMP: + case VariantID::Lambda_OpenMP: + case VariantID::RAJA_OpenMP: + return DataSpace::Host; + + case VariantID::Base_OpenMPTarget: + case VariantID::RAJA_OpenMPTarget: + return DataSpace::Host; + + case VariantID::Base_CUDA: + case VariantID::Lambda_CUDA: + case VariantID::RAJA_CUDA: + return DataSpace::CudaPinned; + + case VariantID::Base_HIP: + case VariantID::Lambda_HIP: + case VariantID::RAJA_HIP: + return DataSpace::HipPinnedCoarse; + + case VariantID::Kokkos_Lambda: + return DataSpace::Host; + + case VariantID::NumVariants: + return DataSpace::Host; + } + return DataSpace::Host; + } + template void allocData(DataSpace dataSpace, T& ptr, Size_type len) { diff --git a/src/common/MemPool.hpp b/src/common/MemPool.hpp new file mode 100644 index 000000000..03b2e755a --- /dev/null +++ b/src/common/MemPool.hpp @@ -0,0 +1,450 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// This file is taken from RAJA/util/basic_mempool.hpp +// and modified to support resources +// + +#ifndef RAJAPerf_BASIC_MEMPOOL_HPP +#define RAJAPerf_BASIC_MEMPOOL_HPP + +#include +#include +#include +#include +#include +#include + +#include "RAJA/util/align.hpp" +#include "RAJA/util/mutex.hpp" + +namespace rajaperf +{ + +namespace basic_mempool +{ + +namespace detail +{ + + +/*! 
\class MemoryArena + ****************************************************************************** + * + * \brief MemoryArena is a map based subclass for class MemPool + * provides book-keeping to divy a large chunk of pre-allocated memory to avoid + * the overhead of malloc/free or cudaMalloc/cudaFree, etc + * + * get/give are the primary calls used by class MemPool to get aligned memory + * from the pool or give it back + * + * + ****************************************************************************** + */ +class MemoryArena +{ +public: + using free_type = std::map; + using free_value_type = typename free_type::value_type; + using used_type = std::map; + using used_value_type = typename used_type::value_type; + + MemoryArena(void* ptr, size_t size) + : m_allocation{ ptr, static_cast(ptr)+size }, + m_free_space(), + m_used_space() + { + m_free_space[ptr] = static_cast(ptr)+size ; + if (m_allocation.begin == nullptr) { + fprintf(stderr, "Attempt to create MemoryArena with no memory"); + std::abort(); + } + } + + MemoryArena(MemoryArena const&) = delete; + MemoryArena& operator=(MemoryArena const&) = delete; + + MemoryArena(MemoryArena&&) = default; + MemoryArena& operator=(MemoryArena&&) = default; + + size_t capacity() + { + return static_cast(m_allocation.end) - + static_cast(m_allocation.begin); + } + + bool unused() { return m_used_space.empty(); } + + void* get_allocation() { return m_allocation.begin; } + + void* get(size_t nbytes, size_t alignment) + { + void* ptr_out = nullptr; + if (capacity() >= nbytes) { + free_type::iterator end = m_free_space.end(); + for (free_type::iterator iter = m_free_space.begin(); iter != end; + ++iter) { + + void* adj_ptr = iter->first; + size_t cap = + static_cast(iter->second) - static_cast(adj_ptr); + + if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) { + + ptr_out = adj_ptr; + + remove_free_chunk(iter, + adj_ptr, + static_cast(adj_ptr) + nbytes); + + add_used_chunk(adj_ptr, static_cast(adj_ptr) + nbytes); + + break; + } + } + } + return ptr_out; + } + + bool give(void* ptr) + { + if (m_allocation.begin <= ptr && ptr < m_allocation.end) { + + used_type::iterator found = m_used_space.find(ptr); + + if (found != m_used_space.end()) { + + add_free_chunk(found->first, found->second); + + m_used_space.erase(found); + + } else { + fprintf(stderr, "Invalid free %p", ptr); + std::abort(); + } + + return true; + } else { + return false; + } + } + +private: + struct memory_chunk { + void* begin; + void* end; + }; + + void add_free_chunk(void* begin, void* end) + { + // integrates a chunk of memory into free_space + free_type::iterator invl = m_free_space.end(); + free_type::iterator next = m_free_space.lower_bound(begin); + + // check if prev exists + if (next != m_free_space.begin()) { + // check if prev can cover [begin, end) + free_type::iterator prev = next; + --prev; + if (prev->second == begin) { + // extend prev to cover [begin, end) + prev->second = end; + + // check if prev can cover next too + if (next != invl) { + assert(next->first != begin); + + if (next->first == end) { + // extend prev to cover next too + prev->second = next->second; + + // remove redundant next + m_free_space.erase(next); + } + } + return; + } + } + + if (next != invl) { + assert(next->first != begin); + + if (next->first == end) { + // extend next to cover [begin, end) + m_free_space.insert(next, free_value_type{begin, next->second}); + m_free_space.erase(next); + + return; + } + } + + // no free space adjacent to this chunk, add seperate free chunk [begin, + // 
end) + m_free_space.insert(next, free_value_type{begin, end}); + } + + void remove_free_chunk(free_type::iterator iter, void* begin, void* end) + { + + void* ptr = iter->first; + void* ptr_end = iter->second; + + // fixup m_free_space, shrinking and adding chunks as needed + if (ptr != begin) { + + // shrink end of current free region to [ptr, begin) + iter->second = begin; + + if (end != ptr_end) { + + // insert free region [end, ptr_end) after current free region + free_type::iterator next = iter; + ++next; + m_free_space.insert(next, free_value_type{end, ptr_end}); + } + + } else if (end != ptr_end) { + + // shrink beginning of current free region to [end, ptr_end) + free_type::iterator next = iter; + ++next; + m_free_space.insert(next, free_value_type{end, ptr_end}); + m_free_space.erase(iter); + + } else { + + // can not reuse current region, erase + m_free_space.erase(iter); + } + } + + void add_used_chunk(void* begin, void* end) + { + // simply inserts a chunk of memory into used_space + m_used_space.insert(used_value_type{begin, end}); + } + + memory_chunk m_allocation; + free_type m_free_space; + used_type m_used_space; +}; + +} /* end namespace detail */ + + +template +using MemPool = ::RAJA::basic_mempool::MemPool; + + +/*! \class LaggedMemPool + ****************************************************************************** + * + * \brief LaggedMemPool pre-allocates a large chunk of memory and provides generic + * malloc/free for the user to allocate aligned data within the pool + * + * LaggedMemPool uses MemoryArena to do the heavy lifting of maintaining access to + * the used/free space. + * + * LaggedMemPool provides an example generic_allocator which can guide more + *specialized + * allocators. The following are some examples + * + * using device_mempool_type = basic_mempool::LaggedMemPool; + * using device_zeroed_mempool_type = + *basic_mempool::LaggedMemPool; + * using pinned_mempool_type = basic_mempool::LaggedMemPool; + * + * The user provides the specialized allocator, for example : + * struct DeviceAllocator { + * + * // returns a valid pointer on success, nullptr on failure + * void* malloc(size_t nbytes) + * { + * void* ptr; + * cudaErrchk(cudaMalloc(&ptr, nbytes)); + * return ptr; + * } + * + * // returns true on success, false on failure + * bool free(void* ptr) + * { + * cudaErrchk(cudaFree(ptr)); + * return true; + * } + * }; + * + * + ****************************************************************************** + */ +template +class LaggedMemPool +{ +public: + using allocator_type = allocator_t; + + static inline LaggedMemPool& getInstance() + { + static LaggedMemPool pool{}; + return pool; + } + + static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull; + + LaggedMemPool(size_t default_arena_size = default_default_arena_size, + lagged_res const& lag_res = lagged_res::get_default()) + : m_arenas(), m_default_arena_size(default_arena_size), + m_alloc(), m_lagged_frees(), m_lag_res(lag_res) + { + } + + ~LaggedMemPool() + { + // This is here for the case that LaggedMemPool is used as a static object. + // If allocator_t uses cudaFree then it will fail with + // cudaErrorCudartUnloading when called after main. 
+ } + + + void free_chunks() + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + while (!m_arenas.empty()) { + void* allocation_ptr = m_arenas.front().get_allocation(); + m_alloc.free(allocation_ptr); + m_arenas.pop_front(); + } + } + + size_t arena_size() + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + return m_default_arena_size; + } + + size_t arena_size(size_t new_size) + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + size_t prev_size = m_default_arena_size; + m_default_arena_size = new_size; + return prev_size; + } + + template + T* malloc(size_t nTs, size_t alignment = alignof(T)) + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + auto get_from_existing_arena = [&](size_t size, size_t alignment) { + void* ptr = nullptr; + for (detail::MemoryArena& arena : m_arenas) { + ptr = arena.get(size, alignment); + if (ptr != nullptr) { + break; + } + } + return ptr; + }; + + auto get_from_new_arena = [&](size_t size, size_t alignment) { + void* ptr = nullptr; + const size_t alloc_size = + std::max(size + alignment, m_default_arena_size); + void* arena_ptr = m_alloc.malloc(alloc_size); + if (arena_ptr != nullptr) { + m_arenas.emplace_front(arena_ptr, alloc_size); + ptr = m_arenas.front().get(size, alignment); + } + return ptr; + }; + + const size_t size = nTs * sizeof(T); + + void* ptr = get_from_existing_arena(size, alignment); + + if (ptr == nullptr) { + free_lagged_memory_impl(); + ptr = get_from_existing_arena(size, alignment); + } + + if (ptr == nullptr) { + ptr = get_from_new_arena(size, alignment); + } + + return static_cast(ptr); + } + + void free(const void* cptr) + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + m_lagged_frees.emplace_back(const_cast(cptr)); + } + + void free_lagged_memory() + { +#if defined(RAJA_ENABLE_OPENMP) + RAJA::lock_guard lock(m_mutex); +#endif + + free_lagged_memory_impl(); + } + +private: + using arena_container_type = std::list; + +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp::mutex m_mutex; +#endif + + arena_container_type m_arenas; + size_t m_default_arena_size; + allocator_t m_alloc; + std::vector m_lagged_frees; + lagged_res m_lag_res; + + + void free_lagged_memory_impl() + { + if (!m_lagged_frees.empty()) { + m_lag_res.wait(); + for (void* ptr : m_lagged_frees) { + for (detail::MemoryArena& arena : m_arenas) { + if (arena.give(ptr)) { + ptr = nullptr; + break; + } + } + if (ptr != nullptr) { + fprintf(stderr, "Unknown pointer %p", ptr); + } + } + m_lagged_frees.clear(); + } + } +}; + +} /* end namespace basic_mempool */ + +} /* end namespace rajaperf */ + + +#endif /* RAJAPerf_BASIC_MEMPOOL_HPP */ diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index dc9d0e20b..a6e3ca69b 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -75,6 +75,8 @@ #include "stream/MUL.hpp" #include "stream/ADD.hpp" #include "stream/TRIAD.hpp" +#include "stream/TRIAD_PARTED.hpp" +#include "stream/TRIAD_PARTED_FUSED.hpp" #include "stream/DOT.hpp" // @@ -226,6 +228,8 @@ static const std::string KernelNames [] = std::string("Stream_DOT"), std::string("Stream_MUL"), std::string("Stream_TRIAD"), + std::string("Stream_TRIAD_PARTED"), + std::string("Stream_TRIAD_PARTED_FUSED"), // // Apps kernels... 
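For orientation, a minimal usage sketch of the pool machinery introduced above (rajaperf::basic_mempool::MemPool backed by the new dataspace_allocator from common/DataUtils.hpp). It is illustrative only and not part of the patch; the function name example_fuser_pool_usage and the choice of DataSpace::Host are assumptions.

// Illustrative sketch (assumed names); exercises the types added in
// common/DataUtils.hpp (dataspace_allocator) and common/MemPool.hpp.
#include "common/DataUtils.hpp"
#include "common/MemPool.hpp"

namespace rajaperf {

inline void example_fuser_pool_usage()
{
  // MemPool aliases RAJA::basic_mempool::MemPool with a user-supplied
  // allocator; dataspace_allocator routes arena allocations through the
  // suite's allocData/deallocData for the chosen DataSpace.
  using host_pool =
      basic_mempool::MemPool<dataspace_allocator<DataSpace::Host>>;

  host_pool& pool = host_pool::getInstance();

  // Carve 256 doubles out of an arena (an arena is created on demand).
  double* buf = pool.malloc<double>(256);

  // ... fill and use buf ...

  // Return the chunk to the arena's free list.
  pool.free(buf);
}

} // end namespace rajaperf

The LaggedMemPool variant has the same malloc/free interface but defers frees: free() only records the pointer, and free_lagged_memory() (or a later malloc that cannot be satisfied from existing arenas) waits on the lagged resource before returning chunks to their arenas, so in-flight device work that still reads the memory can complete first.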
@@ -897,6 +901,14 @@ KernelBase* getKernelObject(KernelID kid, kernel = new stream::TRIAD(run_params); break; } + case Stream_TRIAD_PARTED : { + kernel = new stream::TRIAD_PARTED(run_params); + break; + } + case Stream_TRIAD_PARTED_FUSED : { + kernel = new stream::TRIAD_PARTED_FUSED(run_params); + break; + } // // Apps kernels... diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index a112a44d1..624f37aa6 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -136,6 +136,8 @@ enum KernelID { Stream_DOT, Stream_MUL, Stream_TRIAD, + Stream_TRIAD_PARTED, + Stream_TRIAD_PARTED_FUSED, // // Apps kernels... diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 061e143cf..fb95769f8 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -12,8 +12,13 @@ #include #include +#include #include +#include +#include +#include +#include #include #include @@ -38,6 +43,9 @@ RunParams::RunParams(int argc, char** argv) size(0.0), size_factor(0.0), data_alignment(RAJA::DATA_ALIGN), + num_parts(10), + part_type(PartType::Even), + part_size_order(PartSizeOrder::Random), gpu_stream(1), gpu_block_sizes(), mpi_size(1), @@ -91,6 +99,142 @@ RunParams::~RunParams() } + +/* + ******************************************************************************* + * + * Reorder partition boundaries based on params while preserving sizes. + * + ******************************************************************************* + */ +void RunParams::reorderPartitionSizes(std::vector& parts) const +{ + std::vector size_of_parts; + + size_of_parts.reserve(num_parts); + + for (size_t p = 1; p < parts.size(); ++p) { + size_of_parts.emplace_back(parts[p] - parts[p-1]); + } + + switch ( part_size_order ) { + case PartSizeOrder::Random: + { + std::mt19937 rng(parts.size()); // seed consistently + std::shuffle(size_of_parts.begin(), size_of_parts.end(), rng); + } break; + case PartSizeOrder::Ascending: + { + std::sort(size_of_parts.begin(), size_of_parts.end(), std::less{}); + } break; + case PartSizeOrder::Descending: + { + std::sort(size_of_parts.begin(), size_of_parts.end(), std::greater{}); + } break; + default: + { + getCout() << "RunParams::reorderPartitionSizes: unknown part_size_order" << std::endl; + } break; + } + + for (size_t p = 1; p < parts.size(); ++p) { + parts[p] = parts[p-1] + size_of_parts[p-1]; + } +} + +/* + ******************************************************************************* + * + * Get a partition boundaries based on params. + * + ******************************************************************************* + */ +std::vector RunParams::getPartition(Index_type len, Index_type num_parts) const +{ + std::vector parts; + + parts.reserve(num_parts+1); + + parts.emplace_back(0); + + switch ( (len > num_parts && num_parts > 1) + ? 
part_type : PartType::Even ) { + case PartType::Even: + { + for (Index_type p = 1; p < num_parts; ++p) { + + parts.emplace_back((len/num_parts)*p + + (len%num_parts)*p / num_parts); + } + } break; + + case PartType::Geometric: + { + auto geo_sum = [](double a, double r, double n) { + // sum of geometric series + // for i in [0, n), a*pow(r, i) + return a * (1.0 - std::pow(r, n)) / (1.0 - r); + }; + + auto geo_solve_for_r = [&](double sum, double a, double n) + { + double max_r = std::pow(sum/a, 1.0 / (n-1.0)); + double min_r = 1.0; + + double r = (max_r + min_r) / 2.0; + double diff = geo_sum(a, r, n) - sum; + + constexpr double tolerance = 1.0; + constexpr size_t max_iter = 1000; + + // use bisection to find r + for (size_t iter = 0; + iter < max_iter && (diff < 0.0 || diff > tolerance); + ++iter) { + + if (diff > 0.0) { + max_r = r; + } else { + min_r = r; + } + + r = (max_r + min_r) / 2.0; + diff = geo_sum(a, r, n) - sum; + } + + return r; + }; + + constexpr double a = 1.0; + double r = geo_solve_for_r(len, a, num_parts); + + for (Index_type p = 1; p < num_parts; ++p) { + + Index_type val = static_cast(std::floor(geo_sum(a, r, p))); + + if (val > 0 && val < len) { + parts.emplace_back(val); + } else { + getCout() << "RunParams::getPartition: Geometric failed to generate partition" << std::endl; + break; + } + } + + } break; + default: + { + getCout() << "RunParams::getPartition: unknown part_type" << std::endl; + } break; + } + + parts.emplace_back(len); + + reorderPartitionSizes(parts); + + return parts; +} + + /* ******************************************************************************* * @@ -119,6 +263,9 @@ void RunParams::print(std::ostream& str) const str << "\n size = " << size; str << "\n size_factor = " << size_factor; str << "\n data_alignment = " << data_alignment; + str << "\n num_parts = " << num_parts; + str << "\n part_type = " << PartTypeToStr(part_type); + str << "\n part_size_order = " << PartSizeOrderToStr(part_size_order); str << "\n gpu stream = " << ((gpu_stream == 0) ? 
"0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -433,6 +580,78 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--num_parts") ) { + + i++; + if ( i < argc ) { + long long num_parts_arg = ::atoll( argv[i] ); + if ( num_parts_arg < 1 ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << 1 + << std::endl; + input_state = BadInput; + } else { + num_parts = num_parts_arg; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--part_type") ) { + + bool got_someting = false; + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + for (int ipt = 0; ipt < static_cast(PartType::NumPartTypes); ++ipt) { + PartType pt = static_cast(ipt); + if (PartTypeToStr(pt) == opt) { + got_someting = true; + part_type = pt; + break; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give a valid partition type" + << std::endl; + input_state = BadInput; + } + } + } + + } else if ( opt == std::string("--part_size_order") ) { + + bool got_someting = false; + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + for (int ipso = 0; ipso < static_cast(PartSizeOrder::NumPartSizeOrders); ++ipso) { + PartSizeOrder pso = static_cast(ipso); + if (PartSizeOrderToStr(pso) == opt) { + got_someting = true; + part_size_order = pso; + break; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give a valid partition size order" + << std::endl; + input_state = BadInput; + } + } + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -1110,7 +1329,7 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t -et default library (exclude default and library tunings)\n\n"; str << "\t Options for selecting kernel data used in kernels....\n" - << "\t ======================================================\n\n";; + << "\t ======================================================\n\n"; str << "\t --data_alignment, -align [default is RAJA::DATA_ALIGN]\n" << "\t (minimum memory alignment for host allocations)\n" @@ -1118,6 +1337,24 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t -align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; + str << "\t --num_parts [default is 10]\n" + << "\t (number of parts for *_PARTED kernels)\n" + << "\t Must be at least 1.\n"; + str << "\t\t Example...\n" + << "\t\t --num_parts 100 (breaks *_PARTED kernels into 100 loops)\n\n"; + + str << "\t --part_type [default is Even]\n" + << "\t (distribution for parts in *_PARTED kernels).\n" + << "\t Valid partition types are 'Even' and 'Geometric'\n"; + str << "\t\t Example...\n" + << "\t\t --part_type Geometric (makes partitions with a fixed ratio of sizes)\n\n"; + + str << "\t --part_size_order [default is Random]\n" + << "\t (way to order partition sizes).\n" + << "\t Valid partition size orders are 'Random', 'Ascending', and 'Descending'\n"; + str << "\t\t Example...\n" + << "\t\t --part_size_order Ascending (sort partition sizes in ascending order)\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; diff --git 
a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 10ae761a0..dd3cb80ae 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -97,6 +97,67 @@ class RunParams { } } + /*! + * \brief Enumeration indicating how to separate partitions + */ + enum PartType { + Even, /*!< all parts use same size */ + Geometric, /*!< part sizes are multiples of a single factor */ + + NumPartTypes + }; + + /*! + * \brief Translate PartType enum value to string + */ + static std::string PartTypeToStr(PartType pt) + { + switch (pt) { + case PartType::Even: + return "Even"; + case PartType::Geometric: + return "Geometric"; + case PartType::NumPartTypes: + default: + return "Unknown"; + } + } + + /*! + * \brief Enumeration indicating how to separate partitions + */ + enum PartSizeOrder { + Random, /*!< part sizes are ordered randomly (but consistently) */ + Ascending, /*!< part sizes are in ascending order */ + Descending, /*!< part sizes are in descending order */ + + NumPartSizeOrders + }; + + /*! + * \brief Translate PartSizeOrder enum value to string + */ + static std::string PartSizeOrderToStr(PartSizeOrder pt) + { + switch (pt) { + case PartSizeOrder::Random: + return "Random"; + case PartSizeOrder::Ascending: + return "Ascending"; + case PartSizeOrder::Descending: + return "Descending"; + case PartSizeOrder::NumPartSizeOrders: + default: + return "Unknown"; + } + } + + /*! + * \brief Get a partition from a length, number of partitions, and PartType enum value + * Note that the vector will be of length (num_part+1) + */ + std::vector getPartition(Index_type len, Index_type num_parts) const; + /*! * \brief Return state of input parsed to this point. */ @@ -123,6 +184,8 @@ class RunParams { Size_type getDataAlignment() const { return data_alignment; } + Index_type getNumParts() const { return num_parts; } + int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -199,6 +262,11 @@ class RunParams { private: RunParams() = delete; + /*! + * \brief Reorder partition sizes, used in getPartition. + */ + void reorderPartitionSizes(std::vector& parts) const; + //@{ //! 
@name Routines used in command line parsing and printing option output void parseCommandLineOptions(int argc, char** argv); @@ -231,9 +299,13 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ - double size_factor; /*!< default kernel size multipier (input option) */ + double size_factor; /*!< default kernel size multiplier (input option) */ Size_type data_alignment; + Index_type num_parts; /*!< number of parts used in parted kernels (input option) */ + PartType part_type; /*!< how the partition sizes are generated (input option) */ + PartSizeOrder part_size_order; /*!< how the partition sizes are ordered (input option) */ + int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ int mpi_size; /*!< Number of MPI ranks */ diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index b730791fc..c1f8241ae 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -38,5 +38,17 @@ blt_add_library( TRIAD-Cuda.cpp TRIAD-OMPTarget.cpp TRIAD-OMP.cpp + TRIAD_PARTED.cpp + TRIAD_PARTED-Seq.cpp + TRIAD_PARTED-Hip.cpp + TRIAD_PARTED-Cuda.cpp + TRIAD_PARTED-OMPTarget.cpp + TRIAD_PARTED-OMP.cpp + TRIAD_PARTED_FUSED.cpp + TRIAD_PARTED_FUSED-Seq.cpp + TRIAD_PARTED_FUSED-Hip.cpp + TRIAD_PARTED_FUSED-Cuda.cpp + TRIAD_PARTED_FUSED-OMPTarget.cpp + TRIAD_PARTED_FUSED-OMP.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/stream/TRIAD_PARTED-Cuda.cpp b/src/stream/TRIAD_PARTED-Cuda.cpp new file mode 100644 index 000000000..c6d2513b8 --- /dev/null +++ b/src/stream/TRIAD_PARTED-Cuda.cpp @@ -0,0 +1,580 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, + Index_type ibegin, Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x + ibegin; + if (i < iend) { + TRIAD_PARTED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStream(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( 
cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStreamOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStreamEvent(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), cudaEvent_t{}); + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventCreateWithFlags( &events[p], cudaEventDisableTiming ) ); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, 
block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventDestroy( events[p] ) ); + } +} + +template < size_t block_size > +void TRIAD_PARTED::runCudaVariantStreamEventOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getCudaResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), cudaEvent_t{}); + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventCreateWithFlags( &events[p], cudaEventDisableTiming ) ); + } + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + triad_parted<<>>( a, b, c, alpha, + ibegin, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + cudaErrchk( cudaStreamWaitEvent( res[p].get_stream(), events[0] ) ); + + const 
Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaEventRecord( events[p], res[p].get_stream() ) ); + cudaErrchk( cudaStreamWaitEvent( res[0].get_stream(), events[p] ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::cuda_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Cuda variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + cudaErrchk( cudaEventDestroy( events[p] ) ); + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void TRIAD_PARTED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStream(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStreamOpenmp(vid); + + } + + t += 1; +#endif + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStreamEvent(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantStreamEventOpenmp(vid); + + } + + t += 1; +#endif + + } + + }); +} + +void TRIAD_PARTED::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "stream_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, "stream_omp_"+std::to_string(block_size)); +#endif + + addVariantTuningName(vid, "stream_event_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, "stream_event_omp_"+std::to_string(block_size)); +#endif + + } + + }); +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/stream/TRIAD_PARTED-Hip.cpp b/src/stream/TRIAD_PARTED-Hip.cpp new file mode 100644 index 000000000..e8e3ffa05 --- /dev/null +++ b/src/stream/TRIAD_PARTED-Hip.cpp @@ -0,0 +1,592 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, + Index_type ibegin, Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x + ibegin; + if (i < iend) { + TRIAD_PARTED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res.get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStream(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend 
= parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStreamOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + + + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStreamEvent(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), hipEvent_t{}); + for (size_t p = 0; p < parts.size(); 
++p ) { + hipErrchk( hipEventCreateWithFlags( &events[p], hipEventDisableTiming ) ); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + hipErrchk( hipEventDestroy( events[p] ) ); + } +} + +template < size_t block_size > +void TRIAD_PARTED::runHipVariantStreamEventOpenmp(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + std::vector res; + res.reserve(parts.size()); + res.emplace_back(getHipResource()); + for (size_t p = 1; p < parts.size(); ++p ) { + res.emplace_back(p-1); + } + + std::vector events(parts.size(), hipEvent_t{}); + for (size_t p = 0; p < parts.size(); ++p ) { + hipErrchk( hipEventCreateWithFlags( &events[p], hipEventDisableTiming ) ); + } + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t 
grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((triad_parted), dim3(grid_size), dim3(block_size), shmem, res[p].get_stream(), a, b, c, alpha, + ibegin, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipEventRecord( events[0], res[0].get_stream() ) ); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + hipErrchk( hipStreamWaitEvent( res[p].get_stream(), events[0], 0 ) ); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_lambda = [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, shmem, res[p].get_stream(), ibegin, iend, triad_parted_lambda); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipEventRecord( events[p], res[p].get_stream() ) ); + hipErrchk( hipStreamWaitEvent( res[0].get_stream(), events[p], 0 ) ); + } + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + camp::resources::Event e0 = res[0].get_event_erased(); + + #pragma omp parallel for default(shared) + for (size_t p = 1; p < parts.size(); ++p ) { + res[p].wait_for(&e0); + + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + camp::resources::Event ep = RAJA::forall< RAJA::hip_exec >( res[p], + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_PARTED_BODY; + }); + + res[0].wait_for(&ep); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown Hip variant id = " << vid << std::endl; + } + + for (size_t p = 0; p < parts.size(); ++p ) { + hipErrchk( hipEventDestroy( events[p] ) ); + } +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void TRIAD_PARTED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStream(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStreamOpenmp(vid); + + } + + t += 1; +#endif + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStreamEvent(vid); + + } + + t += 1; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantStreamEventOpenmp(vid); + + } + + t += 1; +#endif + + } + + }); +} + +void TRIAD_PARTED::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "stream_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, 
"stream_omp_"+std::to_string(block_size)); +#endif + + addVariantTuningName(vid, "stream_event_"+std::to_string(block_size)); + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + addVariantTuningName(vid, "stream_event_omp_"+std::to_string(block_size)); +#endif + + } + + }); +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/stream/TRIAD_PARTED-OMP.cpp b/src/stream/TRIAD_PARTED-OMP.cpp new file mode 100644 index 000000000..2366ca3e4 --- /dev/null +++ b/src/stream/TRIAD_PARTED-OMP.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + auto triad_parted_lam = [=](Index_type i) { + TRIAD_PARTED_BODY; + }; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + TRIAD_PARTED_BODY; + } + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + triad_parted_lam(i); + } + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), triad_parted_lam); + } + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n TRIAD_PARTED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED-OMPTarget.cpp b/src/stream/TRIAD_PARTED-OMPTarget.cpp new file mode 100644 index 000000000..0733bc8b8 --- /dev/null +++ b/src/stream/TRIAD_PARTED-OMPTarget.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +void TRIAD_PARTED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + #pragma omp target is_device_ptr(a, b, c) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + TRIAD_PARTED_BODY; + } + } + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRIAD_PARTED_BODY; + }); + } + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/stream/TRIAD_PARTED-Seq.cpp b/src/stream/TRIAD_PARTED-Seq.cpp new file mode 100644 index 000000000..b261de348 --- /dev/null +++ b/src/stream/TRIAD_PARTED-Seq.cpp @@ -0,0 +1,105 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_DATA_SETUP; + +#if defined(RUN_RAJA_SEQ) + auto triad_parted_lam = [=](Index_type i) { + TRIAD_PARTED_BODY; + }; +#endif + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + for (Index_type i = ibegin; i < iend; ++i ) { + TRIAD_PARTED_BODY; + } + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + for (Index_type i = ibegin; i < iend; ++i ) { + triad_parted_lam(i); + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), triad_parted_lam); + } + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n TRIAD_PARTED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED.cpp b/src/stream/TRIAD_PARTED.cpp new file mode 100644 index 000000000..7ad85673e --- /dev/null +++ b/src/stream/TRIAD_PARTED.cpp @@ -0,0 +1,100 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +TRIAD_PARTED::TRIAD_PARTED(const RunParams& params) + : KernelBase(rajaperf::Stream_TRIAD_PARTED, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(1000); + + setActualProblemSize( getTargetProblemSize() ); + + const Index_type num_parts = std::min(params.getNumParts(), getActualProblemSize()); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1*num_parts); + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + getActualProblemSize() ); + setFLOPsPerRep(2 * getActualProblemSize()); + + checksum_scale_factor = 0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + m_parts.reserve(num_parts+1); + m_parts.emplace_back(0); + for (Index_type p = 1; p < num_parts; ++p) { + // use evenly spaced parts for now + m_parts.emplace_back((getActualProblemSize()/num_parts)*p + + (getActualProblemSize()%num_parts)*p / num_parts); + } + m_parts.emplace_back(getActualProblemSize()); + + setUsesFeature( Forall ); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); +} + +TRIAD_PARTED::~TRIAD_PARTED() +{ +} + +void TRIAD_PARTED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); + allocAndInitData(m_b, getActualProblemSize(), vid); + allocAndInitData(m_c, getActualProblemSize(), vid); + initData(m_alpha, vid); +} + +void TRIAD_PARTED::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor , vid); +} + +void TRIAD_PARTED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_a, vid); + deallocData(m_b, vid); + deallocData(m_c, vid); +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED.hpp b/src/stream/TRIAD_PARTED.hpp new file mode 100644 index 000000000..83625b733 --- /dev/null +++ b/src/stream/TRIAD_PARTED.hpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// TRIAD_PARTED kernel reference implementation: +/// +/// for (size_t p = 1; p < parts.size(); ++p ) { +/// Index_type ibegin = iparts[p-1]; +/// Index_type iend = iparts[p]; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// a[i] = b[i] + alpha * c[i] ; +/// } +/// } +/// + +#ifndef RAJAPerf_Stream_TRIAD_PARTED_HPP +#define RAJAPerf_Stream_TRIAD_PARTED_HPP + +#define TRIAD_PARTED_DATA_SETUP \ + std::vector parts = m_parts; \ + \ + Real_ptr a = m_a; \ + Real_ptr b = m_b; \ + Real_ptr c = m_c; \ + Real_type alpha = m_alpha; + +#define TRIAD_PARTED_BODY \ + a[i] = b[i] + alpha * c[i] ; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace stream +{ + +class TRIAD_PARTED : public KernelBase +{ +public: + + TRIAD_PARTED(const RunParams& params); + + ~TRIAD_PARTED(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runCudaVariantStream(VariantID vid); + template < size_t block_size > + void runHipVariantStream(VariantID vid); + template < size_t block_size > + void runCudaVariantStreamOpenmp(VariantID vid); + template < size_t block_size > + void runHipVariantStreamOpenmp(VariantID vid); + template < size_t block_size > + void runCudaVariantStreamEvent(VariantID vid); + template < size_t block_size > + void runHipVariantStreamEvent(VariantID vid); + template < size_t block_size > + void runCudaVariantStreamEventOpenmp(VariantID vid); + template < size_t block_size > + void runHipVariantStreamEventOpenmp(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + std::vector m_parts; + + Real_ptr m_a; + Real_ptr m_b; + Real_ptr m_c; + Real_type m_alpha; +}; + +} // end namespace stream +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/stream/TRIAD_PARTED_FUSED-Cuda.cpp b/src/stream/TRIAD_PARTED_FUSED-Cuda.cpp new file mode 100644 index 000000000..efcfbc1b5 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-Cuda.cpp @@ -0,0 +1,976 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_graph(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, + Index_type ibegin, Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x + ibegin; + if (i < iend) { + TRIAD_PARTED_FUSED_BODY; + } +} + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_CUDA(vid) \ + Index_type* len_ptrs; \ + Real_ptr* a_ptrs; \ + Real_ptr* b_ptrs; \ + Real_ptr* c_ptrs; \ + Real_type* alpha_ptrs; \ + Index_type* ibegin_ptrs; \ + allocData(getFuserDataSpace(vid), len_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), a_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), b_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), c_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), alpha_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), ibegin_ptrs, parts.size()-1); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), len_ptrs); \ + deallocData(getFuserDataSpace(vid), a_ptrs); \ + deallocData(getFuserDataSpace(vid), b_ptrs); \ + deallocData(getFuserDataSpace(vid), c_ptrs); \ + deallocData(getFuserDataSpace(vid), alpha_ptrs); \ + deallocData(getFuserDataSpace(vid), ibegin_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_soa(Index_type* len_ptrs, Real_ptr* a_ptrs, + Real_ptr* b_ptrs, Real_ptr* c_ptrs, + Real_type* alpha_ptrs, Index_type* ibegin_ptrs) +{ + Index_type j = blockIdx.y; + + Index_type len = len_ptrs[j]; + Real_ptr a = a_ptrs[j]; + Real_ptr b = b_ptrs[j]; + Real_ptr c = c_ptrs[j]; + Real_type alpha = alpha_ptrs[j]; + Index_type ibegin = ibegin_ptrs[j]; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(vid, num_holders) \ + triad_holder* triad_holders; \ + allocData(getFuserDataSpace(vid), triad_holders, (num_holders)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(vid) \ + deallocData(getFuserDataSpace(vid), triad_holders); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_aos(triad_holder* triad_holders) +{ + Index_type j = blockIdx.y; + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + +using scan_index_type = RAJA::cuda_dim_member_t; +#define WARP_SIZE 32 +#define warp_shfl(...) 
__shfl_sync(0xffffffff, __VA_ARGS__) + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_scan_aos(scan_index_type* first_blocks, scan_index_type num_fused, + triad_holder* triad_holders) +{ + scan_index_type min_j = 0; + scan_index_type max_j = num_fused-1; + scan_index_type j = (min_j + max_j + 1) / 2; + scan_index_type first_block = first_blocks[j]; + while (min_j != max_j) { + if (first_block > blockIdx.x) { + max_j = j-1; + } else { + min_j = j; + } + j = (min_j + max_j + 1) / 2; + first_block = first_blocks[j]; + } + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + Index_type ii = threadIdx.x + (blockIdx.x - first_block) * block_size; + if (ii < len) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantGraphReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + + cudaGraph_t graph; + cudaErrchk(cudaGraphCreate(&graph, 0)); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); + constexpr size_t shmem = 0; + void* kernelArgs[] {(void*)&a, (void*)&b, (void*)&c, (void*)&alpha, (void*)&ibegin, (void*)&iend}; + + cudaKernelNodeParams params{}; + params.func = (void *)triad_parted_graph; + params.gridDim = dim3(grid_size); + params.blockDim = dim3(block_size); + params.sharedMemBytes = shmem; + params.kernelParams = kernelArgs; + + cudaGraphNode_t node; + cudaErrchk(cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms)); + } + + + cudaGraphExec_t graphexec; + { + constexpr size_t errbufsize = 256; + cudaGraphNode_t errnode = nullptr; + char errbuf[errbufsize]; + cudaErrchk(cudaGraphInstantiate(&graphexec, graph, &errnode, errbuf, errbufsize)); + } + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaGraphLaunch(graphexec, 0) ); + + } + stopTimer(); + + cudaErrchk(cudaGraphExecDestroy(graphexec)); + cudaErrchk(cudaGraphDestroy(graph)); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantSOA2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_CUDA(Base_CUDA) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, 
b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + cudaErrchk( cudaGetLastError() ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_CUDA(Base_CUDA) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantSOA2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_CUDA(Base_CUDA) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_CUDA(Base_CUDA) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders); + cudaErrchk( cudaGetLastError() ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< 
workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dPoolSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + const size_t pool_size = 32ull * 1024ull * 1024ull; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = std::max(parts.size()-1, pool_size / sizeof(triad_holder)); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + + Index_type holder_start = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + if (holder_start+parts.size()-1 > num_holders) { + // synchronize when have to reuse memory + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + holder_start = 0; + } + + Index_type num_fused = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[holder_start+num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + num_fused += 1; + } + + Index_type len_ave = (len_sum + num_fused-1) / num_fused; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, num_fused); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders+holder_start); + cudaErrchk( cudaGetLastError() ); + holder_start += num_fused; + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::LaggedMemPool< + dataspace_allocator, camp::resources::Cuda>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder(pool_size, res); + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin 
= parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_aos<<>>( + triad_holders); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dReuseFunctionPointer(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using 
workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOS2dReuseVirtualFunction(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runCudaVariantAOSScanReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_CUDA(Base_CUDA, num_holders) + scan_index_type* first_blocks; + allocData(getFuserDataSpace(Base_CUDA), first_blocks, (num_holders)); + + Index_type num_fused = 0; + scan_index_type num_blocks = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + 
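// first_blocks[j] holds the first x-block index assigned to part j; the
// triad_parted_fused_scan_aos kernel binary searches this array against
// blockIdx.x to recover which part a block should work on.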
first_blocks[num_fused] = num_blocks; + num_blocks += (static_cast(iend-ibegin) + + static_cast(block_size)-1) / + static_cast(block_size); + num_fused += 1; + } + dim3 nthreads_per_block(block_size); + dim3 nblocks(num_blocks); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_scan_aos<<>>( + first_blocks, num_fused, triad_holders); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + deallocData(getFuserDataSpace(Base_CUDA), first_blocks); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_CUDA(Base_CUDA) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void TRIAD_PARTED_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantGraphReuse(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantSOA2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantSOA2dReuse(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOSScanReuse(vid); + + } + + t += 1; + } + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dPoolSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dReuse(vid); + + } + + t += 1; + + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dReuseFunctionPointer(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAOS2dReuseVirtualFunction(vid); + + } + + t += 1; + } + + } + + }); + + } else { + + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void TRIAD_PARTED_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_CUDA ) { + addVariantTuningName(vid, "graph_reuse_"+std::to_string(block_size)); + + addVariantTuningName(vid, "SOA_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "SOA_2d_reuse_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_scan_reuse_"+std::to_string(block_size)); + } + + addVariantTuningName(vid, "AOS_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_poolsync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_reuse_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { + + addVariantTuningName(vid, "AOS_2d_reuse_funcptr_"+std::to_string(block_size)); + addVariantTuningName(vid, "AOS_2d_reuse_virtfunc_"+std::to_string(block_size)); + + } + + } + + }); + + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/stream/TRIAD_PARTED_FUSED-Hip.cpp b/src/stream/TRIAD_PARTED_FUSED-Hip.cpp new file mode 100644 index 000000000..288a4c2d9 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-Hip.cpp @@ -0,0 +1,894 @@ 
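// Illustrative sketch (editorial, not part of this patch) of the "2d" fused launch used
// by the CUDA variants above and mirrored in the HIP file below: nblocks.x is sized from
// the average part length (len_ave), blockIdx.y selects which part a block works on, and
// the grid-stride loop picks up the tail of parts longer than average. Assuming the
// triad_holder layout shown in this patch, the device-side mapping is roughly:
//
//   Index_type j   = blockIdx.y;             // which fused part
//   Index_type len = triad_holders[j].len;   // length of that part
//   for (Index_type ii = threadIdx.x + blockIdx.x * block_size;
//        ii < len;
//        ii += block_size * gridDim.x) {     // grid-stride covers longer-than-average parts
//     Index_type i = ii + triad_holders[j].ibegin;
//     a[i] = b[i] + alpha * c[i];            // TRIAD_PARTED_FUSED_BODY
//   }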
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_HIP(vid) \ + Index_type* len_ptrs; \ + Real_ptr* a_ptrs; \ + Real_ptr* b_ptrs; \ + Real_ptr* c_ptrs; \ + Real_type* alpha_ptrs; \ + Index_type* ibegin_ptrs; \ + allocData(getFuserDataSpace(vid), len_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), a_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), b_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), c_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), alpha_ptrs, parts.size()-1); \ + allocData(getFuserDataSpace(vid), ibegin_ptrs, parts.size()-1); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), len_ptrs); \ + deallocData(getFuserDataSpace(vid), a_ptrs); \ + deallocData(getFuserDataSpace(vid), b_ptrs); \ + deallocData(getFuserDataSpace(vid), c_ptrs); \ + deallocData(getFuserDataSpace(vid), alpha_ptrs); \ + deallocData(getFuserDataSpace(vid), ibegin_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_soa(Index_type* len_ptrs, Real_ptr* a_ptrs, + Real_ptr* b_ptrs, Real_ptr* c_ptrs, + Real_type* alpha_ptrs, Index_type* ibegin_ptrs) +{ + Index_type j = blockIdx.y; + + Index_type len = len_ptrs[j]; + Real_ptr a = a_ptrs[j]; + Real_ptr b = b_ptrs[j]; + Real_ptr c = c_ptrs[j]; + Real_type alpha = alpha_ptrs[j]; + Index_type ibegin = ibegin_ptrs[j]; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(vid, num_holders) \ + triad_holder* triad_holders; \ + allocData(getFuserDataSpace(vid), triad_holders, (num_holders)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(vid) \ + deallocData(getFuserDataSpace(vid), triad_holders); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_aos(triad_holder* triad_holders) +{ + Index_type j = blockIdx.y; + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + for (Index_type ii = threadIdx.x + blockIdx.x * block_size; + ii < len; + ii += block_size * gridDim.x) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + +using scan_index_type = RAJA::hip_dim_member_t; +#define WARP_SIZE 64 +#define warp_shfl(...) 
__shfl(__VA_ARGS__) + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void triad_parted_fused_scan_aos(scan_index_type* first_blocks, scan_index_type num_fused, + triad_holder* triad_holders) +{ + scan_index_type min_j = 0; + scan_index_type max_j = num_fused-1; + scan_index_type j = (min_j + max_j + 1) / 2; + scan_index_type first_block = first_blocks[j]; + while (min_j != max_j) { + if (first_block > blockIdx.x) { + max_j = j-1; + } else { + min_j = j; + } + j = (min_j + max_j + 1) / 2; + first_block = first_blocks[j]; + } + + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + + Index_type ii = threadIdx.x + (blockIdx.x - first_block) * block_size; + if (ii < len) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } +} + + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantSOA2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_HIP(Base_HIP) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + hipErrchk( hipGetLastError() ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_HIP(Base_HIP) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantSOA2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_SETUP_HIP(Base_HIP) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + len_ptrs[index] = iend-ibegin; + a_ptrs[index] = a; + b_ptrs[index] = b; + c_ptrs[index] = c; + alpha_ptrs[index] = alpha; + ibegin_ptrs[index] = ibegin; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_soa<<>>( + len_ptrs, a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SOA_TEARDOWN_HIP(Base_HIP) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t 
block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders); + hipErrchk( hipGetLastError() ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dPoolSync(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + const size_t pool_size = 32ull * 1024ull * 1024ull; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = std::max(parts.size()-1, pool_size / sizeof(triad_holder)); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + + Index_type holder_start = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + if (holder_start+parts.size()-1 > num_holders) { + // synchronize when have to reuse memory + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + holder_start = 0; + } + + Index_type num_fused = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const 
Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[holder_start+num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + num_fused += 1; + } + + Index_type len_ave = (len_sum + num_fused-1) / num_fused; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, num_fused); + constexpr size_t shmem = 0; + triad_parted_fused_aos<<>>( + triad_holders+holder_start); + hipErrchk( hipGetLastError() ); + holder_start += num_fused; + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::LaggedMemPool< + dataspace_allocator, camp::resources::Hip>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder(pool_size, res); + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + Index_type len_ave = (len_sum + index-1) / index; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len_ave + block_size-1) / block_size, index); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_aos<<>>( + triad_holders); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + 
using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + // RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + RAJA::direct_dispatch, decltype(triad_parted_fused_lam)>> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dReuseFunctionPointer(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch + // RAJA::indirect_virtual_function_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOS2dReuseVirtualFunction(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + auto triad_parted_fused_lam = [=] __device__ (Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + 
AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + worksite site = group.run(res); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRIAD_PARTED_FUSED::runHipVariantAOSScanReuse(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t num_holders = parts.size()-1; + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_SETUP_HIP(Base_HIP, num_holders) + scan_index_type* first_blocks; + allocData(getFuserDataSpace(Base_HIP), first_blocks, (num_holders)); + + Index_type num_fused = 0; + scan_index_type num_blocks = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[num_fused] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + first_blocks[num_fused] = num_blocks; + num_blocks += (static_cast(iend-ibegin) + + static_cast(block_size)-1) / + static_cast(block_size); + num_fused += 1; + } + dim3 nthreads_per_block(block_size); + dim3 nblocks(num_blocks); + constexpr size_t shmem = 0; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + triad_parted_fused_scan_aos<<>>( + first_blocks, num_fused, triad_holders); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + deallocData(getFuserDataSpace(Base_HIP), first_blocks); + TRIAD_PARTED_FUSED_MANUAL_FUSER_AOS_TEARDOWN_HIP(Base_HIP) + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +void TRIAD_PARTED_FUSED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantSOA2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantSOA2dReuse(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOSScanReuse(vid); + + } + + t += 1; + } + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dPoolSync(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + 
runHipVariantAOS2dReuse(vid); + + } + + t += 1; + + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dReuseFunctionPointer(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAOS2dReuseVirtualFunction(vid); + + } + + t += 1; + } + + } + + }); + + } else { + + getCout() << "\n TRIAD_PARTED_FUSED : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void TRIAD_PARTED_FUSED::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if ( vid == Base_HIP ) { + addVariantTuningName(vid, "SOA_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "SOA_2d_reuse_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_scan_reuse_"+std::to_string(block_size)); + } + + addVariantTuningName(vid, "AOS_2d_sync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_poolsync_"+std::to_string(block_size)); + + addVariantTuningName(vid, "AOS_2d_reuse_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { + + addVariantTuningName(vid, "AOS_2d_reuse_funcptr_"+std::to_string(block_size)); + addVariantTuningName(vid, "AOS_2d_reuse_virtfunc_"+std::to_string(block_size)); + + } + } + + }); + + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/stream/TRIAD_PARTED_FUSED-OMP.cpp b/src/stream/TRIAD_PARTED_FUSED-OMP.cpp new file mode 100644 index 000000000..73a818b6a --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-OMP.cpp @@ -0,0 +1,210 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + index += 1; + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < index; j++) { + #pragma omp task firstprivate(j) + { + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + for (Index_type ii = 0; ii < len; ++ii ) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < index; j++) { + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + for (Index_type ii = 0; ii < len; ++ii ) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } +#endif + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + + case Lambda_OpenMP : { + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + new(&lambdas[index]) lambda_type(make_lambda(a, b, c, alpha, ibegin)); + lens[index] = iend-ibegin; + index += 1; + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < index; j++) { + #pragma omp task firstprivate(j) + { + auto lambda = lambdas[j]; + Index_type len = lens[j]; + for (Index_type ii = 0; ii < len; ii++) { + lambda(ii); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < index; j++) { + auto lambda = lambdas[j]; + Index_type len = lens[j]; + for (Index_type ii = 0; ii < len; ii++) { + lambda(ii); + } + } +#endif + + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } + + case RAJA_OpenMP : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + 
using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool (allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_fused_lam = [=](Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED_FUSED-OMPTarget.cpp b/src/stream/TRIAD_PARTED_FUSED-OMPTarget.cpp new file mode 100644 index 000000000..2eeee4f24 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-OMPTarget.cpp @@ -0,0 +1,154 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP \ + triad_holder* triad_holders; \ + allocData(DataSpace::OmpTarget, triad_holders, (parts.size()-1)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_COPY_OMP_TARGET \ + initOpenMPDeviceData(omp_triad_holders, triad_holders, index*sizeof(triad_holder)); + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ + deallocData(DataSpace::OmpTarget, triad_holders); \ + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN + +void TRIAD_PARTED_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_FUSED_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + Index_type len_sum = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + len_sum += iend-ibegin; + index += 1; + } + + TRIAD_PARTED_FUSED_MANUAL_FUSER_COPY_OMP_TARGET; + Index_type len_ave = (len_sum + index-1) / index; + #pragma omp target is_device_ptr(a_ptrs, b_ptrs, c_ptrs, alpha_ptrs, ibegin_ptrs, len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < index; j++) { + for (Index_type iii = 0; iii < len_ave; iii++) { + + Index_type len = omp_triad_holders[j].len; + Real_ptr a = omp_triad_holders[j].a; + Real_ptr b = omp_triad_holders[j].b; + Real_ptr c = 
omp_triad_holders[j].c; + Real_type alpha = omp_triad_holders[j].alpha; + Index_type ibegin = omp_triad_holders[j].ibegin; + + for (Index_type ii = iii; ii < len; ii += len_ave) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } + } + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_target_work /**/, + RAJA::ordered, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_fused_lam = [=](Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(); + + } + stopTimer(); + + } else { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/stream/TRIAD_PARTED_FUSED-Seq.cpp b/src/stream/TRIAD_PARTED_FUSED-Seq.cpp new file mode 100644 index 000000000..2def10ff0 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED-Seq.cpp @@ -0,0 +1,169 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/MemPool.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD_PARTED_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + TRIAD_PARTED_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + triad_holders[index] = triad_holder{iend-ibegin, a, b, c, alpha, ibegin}; + index += 1; + } + + for (Index_type j = 0; j < index; j++) { + Index_type len = triad_holders[j].len; + Real_ptr a = triad_holders[j].a; + Real_ptr b = triad_holders[j].b; + Real_ptr c = triad_holders[j].c; + Real_type alpha = triad_holders[j].alpha; + Index_type ibegin = triad_holders[j].ibegin; + for (Index_type ii = 0; ii < len; ++ii ) { + Index_type i = ii + ibegin; + TRIAD_PARTED_FUSED_BODY; + } + } + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type index = 0; + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + new(&lambdas[index]) lambda_type(make_lambda(a, b, c, alpha, ibegin)); + lens[index] = iend-ibegin; + index += 1; + } + + for (Index_type j = 0; j < index; j++) { + auto lambda = lambdas[j]; + Index_type len = lens[j]; + for (Index_type ii = 0; ii < len; ii++) { + lambda(ii); + } + } + + } + stopTimer(); + + TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } + + case RAJA_Seq : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + rajaperf::basic_mempool::MemPool>>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool(allocatorHolder.template getAllocator()); + pool.reserve(parts.size()-1, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (size_t p = 1; p < parts.size(); ++p ) { + const Index_type ibegin = parts[p-1]; + const Index_type iend = parts[p]; + + auto triad_parted_fused_lam = [=](Index_type i) { + TRIAD_PARTED_FUSED_BODY; + }; + + pool.enqueue( + RAJA::TypedRangeSegment(ibegin, iend), + triad_parted_fused_lam ); + } + workgroup group = pool.instantiate(); + worksite site = group.run(); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n TRIAD_PARTED_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED_FUSED.cpp 
b/src/stream/TRIAD_PARTED_FUSED.cpp new file mode 100644 index 000000000..5c0011305 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED.cpp @@ -0,0 +1,91 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD_PARTED_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +TRIAD_PARTED_FUSED::TRIAD_PARTED_FUSED(const RunParams& params) + : KernelBase(rajaperf::Stream_TRIAD_PARTED_FUSED, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(1000); + + setActualProblemSize( getTargetProblemSize() ); + + const Index_type num_parts = std::min(params.getNumParts(), getActualProblemSize()); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + getActualProblemSize() ); + setFLOPsPerRep(2 * getActualProblemSize()); + + checksum_scale_factor = 0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + m_parts = params.getPartition(getActualProblemSize(), num_parts); + + setUsesFeature( Workgroup ); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +TRIAD_PARTED_FUSED::~TRIAD_PARTED_FUSED() +{ +} + +void TRIAD_PARTED_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); + allocAndInitData(m_b, getActualProblemSize(), vid); + allocAndInitData(m_c, getActualProblemSize(), vid); + initData(m_alpha, vid); +} + +void TRIAD_PARTED_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor , vid); +} + +void TRIAD_PARTED_FUSED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_a, vid); + deallocData(m_b, vid); + deallocData(m_c, vid); +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/TRIAD_PARTED_FUSED.hpp b/src/stream/TRIAD_PARTED_FUSED.hpp new file mode 100644 index 000000000..26ef36aa3 --- /dev/null +++ b/src/stream/TRIAD_PARTED_FUSED.hpp @@ -0,0 +1,153 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// TRIAD_PARTED_FUSED kernel reference implementation: +/// +/// for (size_t p = 1; p < parts.size(); ++p ) { +/// Index_type ibegin = iparts[p-1]; +/// Index_type iend = iparts[p]; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// a[i] = b[i] + alpha * c[i] ; +/// } +/// } +/// + +#ifndef RAJAPerf_Stream_TRIAD_PARTED_FUSED_HPP +#define RAJAPerf_Stream_TRIAD_PARTED_FUSED_HPP + +#define TRIAD_PARTED_FUSED_DATA_SETUP \ + std::vector parts = m_parts; \ + \ + Real_ptr a = m_a; \ + Real_ptr b = m_b; \ + Real_ptr c = m_c; \ + Real_type alpha = m_alpha; + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_SETUP \ + triad_holder* triad_holders = new triad_holder[parts.size()-1]; + +#define TRIAD_PARTED_FUSED_MANUAL_FUSER_TEARDOWN \ + delete[] triad_holders; + + +#define TRIAD_PARTED_FUSED_BODY \ + a[i] = b[i] + alpha * c[i] ; + + +#define TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ + auto make_lambda = [](Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, Index_type ibegin) { \ + return [=](Index_type ii) { \ + Index_type i = ii + ibegin; \ + TRIAD_PARTED_FUSED_BODY; \ + }; \ + }; \ + using lambda_type = decltype(make_lambda(Real_ptr(), Real_ptr(), Real_ptr(), Real_type(), Index_type())); \ + lambda_type* lambdas = reinterpret_cast( \ + malloc(sizeof(lambda_type) * (parts.size()-1))); \ + Index_type* lens = new Index_type[parts.size()-1]; + +#define TRIAD_PARTED_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ + free(lambdas); \ + delete[] lens; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace stream +{ + +struct alignas(2*alignof(void*)) triad_holder { + Index_type len; + Real_ptr a; + Real_ptr b; + Real_ptr c; + Real_type alpha; + Index_type ibegin; +}; + +class TRIAD_PARTED_FUSED : public KernelBase +{ +public: + + TRIAD_PARTED_FUSED(const RunParams& params); + + ~TRIAD_PARTED_FUSED(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantGraphReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantSOA2dSync(VariantID vid); + template < size_t block_size > + void runHipVariantSOA2dSync(VariantID vid); + template < size_t block_size > + void runCudaVariantSOA2dReuse(VariantID vid); + template < size_t block_size > + void runHipVariantSOA2dReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dSync(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dSync(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dPoolSync(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dPoolSync(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dReuse(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dReuseFunctionPointer(VariantID 
vid); + template < size_t block_size > + void runHipVariantAOS2dReuseFunctionPointer(VariantID vid); + template < size_t block_size > + void runCudaVariantAOS2dReuseVirtualFunction(VariantID vid); + template < size_t block_size > + void runHipVariantAOS2dReuseVirtualFunction(VariantID vid); + template < size_t block_size > + void runCudaVariantAOSScanReuse(VariantID vid); + template < size_t block_size > + void runHipVariantAOSScanReuse(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + + std::vector<Index_type> m_parts; + + Real_ptr m_a; + Real_ptr m_b; + Real_ptr m_c; + Real_type m_alpha; +}; + +} // end namespace stream +} // end namespace rajaperf + +#endif // closing endif for header file include guard
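
Note (not part of the diff): the triad_parted_fused_scan_aos kernels above map each flat block index back to its partition by binary-searching first_blocks, where first_blocks[j] holds the first block assigned to partition j. A minimal host-side C++ sketch of that lookup, with illustrative names and test data:

#include <cassert>
#include <cstddef>
#include <vector>

// Mirrors the search loop in triad_parted_fused_scan_aos: first_blocks is
// non-decreasing, and partition j owns blocks [first_blocks[j], first_blocks[j+1]).
std::size_t find_partition(const std::vector<std::size_t>& first_blocks,
                           std::size_t block_id)
{
  std::size_t min_j = 0;
  std::size_t max_j = first_blocks.size() - 1;
  std::size_t j = (min_j + max_j + 1) / 2;  // round up so the search always makes progress
  while (min_j != max_j) {
    if (first_blocks[j] > block_id) {
      max_j = j - 1;   // block_id lies before partition j
    } else {
      min_j = j;       // partition j starts at or before block_id
    }
    j = (min_j + max_j + 1) / 2;
  }
  return j;
}

int main()
{
  // Three partitions needing 4, 1, and 3 blocks, respectively.
  const std::vector<std::size_t> first_blocks = {0, 4, 5};
  assert(find_partition(first_blocks, 0) == 0);
  assert(find_partition(first_blocks, 3) == 0);
  assert(find_partition(first_blocks, 4) == 1);
  assert(find_partition(first_blocks, 7) == 2);
  return 0;
}

Each block then handles at most block_size contiguous elements of its partition, so the scan variants need no grid-stride loop.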
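
Note (not part of the diff): the *_2d_sync and *_2d_reuse variants size a single 2D launch from the rounded-up average partition length; blockIdx.y selects the partition and the x dimension covers len_ave elements, with the grid-stride loop in x absorbing partitions longer than average. A small sketch of that sizing arithmetic, using assumed partition lengths:

#include <algorithm>
#include <cstdio>
#include <vector>

using Index_type = long;

int main()
{
  const Index_type block_size = 256;
  const std::vector<Index_type> lens = {1000, 10, 50000};  // assumed, uneven

  Index_type len_sum = 0;
  for (Index_type len : lens) { len_sum += len; }
  const Index_type index   = static_cast<Index_type>(lens.size());
  const Index_type len_ave = (len_sum + index - 1) / index;  // ceiling average

  const Index_type blocks_x = (len_ave + block_size - 1) / block_size;
  const Index_type blocks_y = index;                         // one grid row per partition
  std::printf("grid = (%ld, %ld), block = %ld\n", blocks_x, blocks_y, block_size);

  // Partitions longer than len_ave are still fully covered because the kernel
  // strides by block_size * gridDim.x in x.
  for (Index_type j = 0; j < blocks_y; ++j) {
    Index_type covered = 0;
    for (Index_type ii = 0; ii < lens[j]; ii += block_size * blocks_x) {
      covered += std::min(block_size * blocks_x, lens[j] - ii);
    }
    std::printf("partition %ld: len = %ld, covered = %ld\n", j, lens[j], covered);
  }
  return 0;
}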
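
Note (not part of the diff): the AOS_2d_poolsync variants write each repetition's triad_holder records into one large pinned allocation and only synchronize with the device when the next repetition would overwrite holders that may still be in flight. A hedged sketch of that reuse policy; the sizes and the wait_for_device() stand-in (hipStreamSynchronize/cudaStreamSynchronize in the real variants) are assumptions:

#include <cstddef>
#include <cstdio>

static void wait_for_device() { std::printf("  sync before reusing the pool\n"); }

int main()
{
  const std::size_t holders_per_rep = 100;  // parts.size()-1 per repetition (assumed)
  const std::size_t num_holders     = 350;  // pool capacity in holders (assumed)
  const int run_reps = 8;

  std::size_t holder_start = 0;
  for (int irep = 0; irep < run_reps; ++irep) {
    // Synchronize only when wrapping around would clobber pending holders.
    if (holder_start + holders_per_rep > num_holders) {
      wait_for_device();
      holder_start = 0;
    }
    std::printf("rep %d fills holders [%zu, %zu) and launches asynchronously\n",
                irep, holder_start, holder_start + holders_per_rep);
    holder_start += holders_per_rep;
  }
  return 0;
}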