diff --git a/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf b/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf
index 1af6c82b44..918bd5795a 100644
Binary files a/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf and b/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf differ
diff --git a/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html b/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html
index b05ba20342..3129c6c6db 100644
--- a/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html
+++ b/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html
@@ -767,597 +767,611 @@
666 template <typename D>
667 typename D::Type value (D & reduce_data)
- 669 using ReduceTuple = typename D::Type;
-
- 671 auto hp = reduce_data.hostPtr();
- 672 auto dp = reduce_data.devicePtr();
- 673 auto const& nblocks = reduce_data.nBlocks();
- 674 #if defined(AMREX_USE_SYCL)
- 675 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
- 676 const int N = nblocks[0];
-
-
-
-
-
-
-
- 684 for (int i = 1; i < N; ++i) {
-
-
-
-
-
-
-
- 692 int maxblocks = reduce_data.maxBlocks();
- 693 #ifdef AMREX_USE_SYCL
-
- 695 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
- 696 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
-
-
- 699 auto presult = dtmp.data();
-
-
-
- 703 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
-
-
-
-
-
- 709 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
- 710 auto dp_stream = dp+istream*maxblocks;
- 711 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
- 712 i < nblocks[istream]; i += stride) {
-
-
-
-
- 717 if (gh.threadIdx() == 0) { *presult = dst; }
-
- 719 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
-
-
-
- 723 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
-
-
-
-
-
- 729 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
- 730 auto dp_stream = dp+istream*maxblocks;
- 731 for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
- 732 i < nblocks[istream]; i += stride) {
-
-
-
-
- 737 if (threadIdx.x == 0) { *hp = dst; }
-
-
-
-
-
-
-
-
-
-
- 748 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
- 749 T Sum (N n, T const* v, T init_val = 0)
-
-
-
- 753 using ReduceTuple = typename decltype(reduce_data)::Type;
-
- 755 ReduceTuple hv = reduce_data.value(reduce_op);
- 756 return amrex::get<0>(hv) + init_val;
-
-
- 759 template <typename T, typename N, typename F,
- 760 typename M=std::enable_if_t<std::is_integral<N>::value> >
- 761 T Sum (N n, F&& f, T init_val = 0)
-
-
-
- 765 using ReduceTuple = typename decltype(reduce_data)::Type;
-
- 767 ReduceTuple hv = reduce_data.value(reduce_op);
- 768 return amrex::get<0>(hv) + init_val;
-
-
- 771 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
-
-
-
- 776 using ReduceTuple = typename decltype(reduce_data)::Type;
-
- 778 ReduceTuple hv = reduce_data.value(reduce_op);
- 779 return std::min(amrex::get<0>(hv),init_val);
-
-
- 782 template <typename T, typename N, typename F,
- 783 typename M=std::enable_if_t<std::is_integral<N>::value> >
-
-
-
-
- 788 using ReduceTuple = typename decltype(reduce_data)::Type;
-
- 790 ReduceTuple hv = reduce_data.value(reduce_op);
- 791 return std::min(amrex::get<0>(hv),init_val);
-
-
- 794 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
- 795 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
-
-
-
- 799 using ReduceTuple = typename decltype(reduce_data)::Type;
-
- 801 ReduceTuple hv = reduce_data.value(reduce_op);
- 802 return std::max(amrex::get<0>(hv),init_val);
-
-
- 805 template <typename T, typename N, typename F,
- 806 typename M=std::enable_if_t<std::is_integral<N>::value> >
- 807 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
-
-
-
- 811 using ReduceTuple = typename decltype(reduce_data)::Type;
-
- 813 ReduceTuple hv = reduce_data.value(reduce_op);
- 814 return std::max(amrex::get<0>(hv),init_val);
-
-
- 817 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
-
-
-
- 822 using ReduceTuple = typename decltype(reduce_data)::Type;
-
-
-
- 826 auto hv = reduce_data.value(reduce_op);
- 827 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
-
-
- 830 template <typename T, typename N, typename F,
- 831 typename M=std::enable_if_t<std::is_integral<N>::value> >
-
-
-
-
- 836 using ReduceTuple = typename decltype(reduce_data)::Type;
-
-
-
-
- 841 auto hv = reduce_data.value(reduce_op);
- 842 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
-
-
- 845 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
-
-
-
-
-
- 852 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
+ 669 auto hp = reduce_data.hostPtr();
+
+ 671 if (m_result_is_ready) {
+
+
+
+ 675 using ReduceTuple = typename D::Type;
+
+ 677 auto dp = reduce_data.devicePtr();
+ 678 auto const& nblocks = reduce_data.nBlocks();
+ 679 #if defined(AMREX_USE_SYCL)
+ 680 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
+ 681 const int N = nblocks[0];
+
+
+
+
+
+
+ 688 for (int i = 1; i < N; ++i) {
+
+
+
+
+
+
+
+ 696 int maxblocks = reduce_data.maxBlocks();
+ 697 #ifdef AMREX_USE_SYCL
+
+ 699 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
+ 700 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+
+
+ 703 auto presult = dtmp.data();
+
+
+
+ 707 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
+
+
+
+
+
+ 713 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
+ 714 auto dp_stream = dp+istream*maxblocks;
+ 715 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
+ 716 i < nblocks[istream]; i += stride) {
+
+
+
+
+ 721 if (gh.threadIdx() == 0) { *presult = dst; }
+
+ 723 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+
+
+
+ 727 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
+
+
+
+
+
+ 733 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
+ 734 auto dp_stream = dp+istream*maxblocks;
+ 735 for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+ 736 i < nblocks[istream]; i += stride) {
+
+
+
+
+ 741 if (threadIdx.x == 0) { *hp = dst; }
+
+
+
+
+
+ 747 m_result_is_ready = true;
+
+
+
+
+ 752 bool m_result_is_ready = false;
+
+
+
+
+ 757 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 758 T Sum (N n, T const* v, T init_val = 0)
+
+
+
+ 762 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+ 764 ReduceTuple hv = reduce_data.value(reduce_op);
+ 765 return amrex::get<0>(hv) + init_val;
+
+
+ 768 template <typename T, typename N, typename F,
+ 769 typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 770 T Sum (N n, F&& f, T init_val = 0)
+
+
+
+ 774 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+ 776 ReduceTuple hv = reduce_data.value(reduce_op);
+ 777 return amrex::get<0>(hv) + init_val;
+
+
+ 780 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
+
+
+
+ 785 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+ 787 ReduceTuple hv = reduce_data.value(reduce_op);
+ 788 return std::min(amrex::get<0>(hv),init_val);
+
+
+ 791 template <typename T, typename N, typename F,
+ 792 typename M=std::enable_if_t<std::is_integral<N>::value> >
+
+
+
+
+ 797 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+ 799 ReduceTuple hv = reduce_data.value(reduce_op);
+ 800 return std::min(amrex::get<0>(hv),init_val);
+
+
+ 803 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 804 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
+
+
+
+ 808 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+ 810 ReduceTuple hv = reduce_data.value(reduce_op);
+ 811 return std::max(amrex::get<0>(hv),init_val);
+
+
+ 814 template <typename T, typename N, typename F,
+ 815 typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 816 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
+
+
+
+ 820 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+ 822 ReduceTuple hv = reduce_data.value(reduce_op);
+ 823 return std::max(amrex::get<0>(hv),init_val);
+
+
+ 826 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
+
+
+
+ 831 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+
+
+ 835 auto hv = reduce_data.value(reduce_op);
+ 836 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
+
+
+ 839 template <typename T, typename N, typename F,
+ 840 typename M=std::enable_if_t<std::is_integral<N>::value> >
+
+
+
+
+ 845 using ReduceTuple = typename decltype(reduce_data)::Type;
+
+
+
+
+ 850 auto hv = reduce_data.value(reduce_op);
+ 851 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
+
- 854 #ifdef AMREX_USE_SYCL
- 855 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
- 856 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
- 857 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-
- 859 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
- 860 if (gh.threadIdx() == 0) { *has_any = *dp; }
-
+ 854 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
+
+
+
+
+
+ 861 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
-
-
-
- 866 for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
- 867 i < n && !r; i += stride)
-
- 869 r = pred(v[i]) ? 1 : 0;
-
+ 863 #ifdef AMREX_USE_SYCL
+ 864 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
+ 865 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
+ 866 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+
+ 868 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
+ 869 if (gh.threadIdx() == 0) { *has_any = *dp; }
+
- 872 r = Gpu::blockReduce<Gpu::Device::warp_size>
-
- 874 if (gh.threadIdx() == 0 && r) { *dp = 1; }
-
-
-
- 878 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
-
- 880 __shared__ int has_any;
- 881 if (threadIdx.x == 0) has_any = *dp;
-
-
-
-
-
- 887 for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
- 888 i < n && !r; i += stride)
-
- 890 r = pred(v[i]) ? 1 : 0;
-
- 892 r = Gpu::blockReduce<Gpu::Device::warp_size>
-
- 894 if (threadIdx.x == 0 && r) *dp = 1;
-
-
-
-
-
-
- 901 template <typename P>
-
-
-
-
-
- 907 int ncells = box.numPts();
-
-
- 910 const auto lenxy = len.x*len.y;
- 911 const auto lenx = len.x;
-
- 913 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
-
- 915 #ifdef AMREX_USE_SYCL
- 916 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
- 917 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
- 918 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-
- 920 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
- 921 if (gh.threadIdx() == 0) { *has_any = *dp; }
-
+
+
+
+ 875 for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+ 876 i < n && !r; i += stride)
+
+ 878 r = pred(v[i]) ? 1 : 0;
+
+
+ 881 r = Gpu::blockReduce<Gpu::Device::warp_size>
+
+ 883 if (gh.threadIdx() == 0 && r) { *dp = 1; }
+
+
+
+ 887 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
+
+ 889 __shared__ int has_any;
+ 890 if (threadIdx.x == 0) has_any = *dp;
+
+
+
+
+
+ 896 for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+ 897 i < n && !r; i += stride)
+
+ 899 r = pred(v[i]) ? 1 : 0;
+
+ 901 r = Gpu::blockReduce<Gpu::Device::warp_size>
+
+ 903 if (threadIdx.x == 0 && r) *dp = 1;
+
+
+
+
+
+
+ 910 template <typename P>
+
+
+
+
+
+ 916 int ncells = box.numPts();
+
+
+ 919 const auto lenxy = len.x*len.y;
+ 920 const auto lenx = len.x;
+
+ 922 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch())
-
-
-
- 927 for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
- 928 icell < ncells && !r; icell += stride) {
- 929 int k = icell / lenxy;
- 930 int j = (icell - k*lenxy) / lenx;
- 931 int i = (icell - k*lenxy) - j*lenx;
-
-
-
- 935 r = pred(i,j,k) ? 1 : 0;
-
- 937 r = Gpu::blockReduce<Gpu::Device::warp_size>
-
- 939 if (gh.threadIdx() == 0 && r) { *dp = 1; }
-
-
-
-
-
-
- 946 __shared__ int has_any;
- 947 if (threadIdx.x == 0) has_any = *dp;
-
-
-
-
-
- 953 for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
- 954 icell < ncells && !r; icell += stride) {
- 955 int k = icell / lenxy;
- 956 int j = (icell - k*lenxy) / lenx;
- 957 int i = (icell - k*lenxy) - j*lenx;
-
-
-
- 961 r = pred(i,j,k) ? 1 : 0;
-
- 963 r = Gpu::blockReduce<Gpu::Device::warp_size>
-
- 965 if (threadIdx.x == 0 && r) *dp = 1;
-
-
-
-
-
-
-
-
-
-
- 976 template <typename... Ts>
-
-
-
- 980 using Type = GpuTuple<Ts...>;
-
- 982 template <typename... Ps>
- 983 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
-
- 985 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
-
- 987 for (auto& t : m_tuple) {
-
-
-
-
- 992 ~ReduceData () = default;
- 993 ReduceData (ReduceData<Ts...> const&) = delete;
- 994 ReduceData (ReduceData<Ts...> &&) = delete;
- 995 void operator= (ReduceData<Ts...> const&) = delete;
- 996 void operator= (ReduceData<Ts...> &&) = delete;
-
- 998 Type value () { return m_fn_value(); }
-
- 1000 template <typename... Ps>
- 1001 Type value (ReduceOps<Ps...>& reduce_op)
-
- 1003 return reduce_op.value(*this);
-
-
- 1006 Vector<Type>& reference () { return m_tuple; }
-
- 1008 Type& reference (int tid)
-
- 1010 if (m_tuple.size() == 1) {
-
-
-
- 1014 return m_tuple[tid];
-
-
-
-
- 1019 Vector<Type> m_tuple;
- 1020 std::function<Type()> m_fn_value;
-
-
- 1023 template <typename... Ps>
-
-
-
-
- 1028 template <typename D, typename F>
-
- 1030 static auto call_f (Box const& box, typename D::Type & r, F const& f)
- 1031 noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(0,0,0))>,
- 1032 typename D::Type>::value>
-
- 1034 using ReduceTuple = typename D::Type;
-
-
- 1037 for (int k = lo.z; k <= hi.z; ++k) {
- 1038 for (int j = lo.y; j <= hi.y; ++j) {
- 1039 for (int i = lo.x; i <= hi.x; ++i) {
-
-
-
-
- 1044 template <typename D, typename F>
-
- 1046 static auto call_f (Box const& box, typename D::Type & r, F const& f)
- 1047 noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(Box()))>,
- 1048 typename D::Type>::value>
-
- 1050 using ReduceTuple = typename D::Type;
-
-
-
-
-
- 1056 template <typename MF, typename D, typename F>
- 1057 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
- 1058 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f)
-
- 1060 using ReduceTuple = typename D::Type;
- 1061 #ifdef AMREX_USE_OMP
- 1062 #pragma omp parallel
-
- 1064 for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) {
- 1065 Box const& b = mfi.growntilebox(nghost);
- 1066 const int li = mfi.LocalIndex();
-
-
-
- 1070 for (int k = lo.z; k <= hi.z; ++k) {
- 1071 for (int j = lo.y; j <= hi.y; ++j) {
- 1072 for (int i = lo.x; i <= hi.x; ++i) {
-
-
-
-
-
- 1078 template <typename MF, typename D, typename F>
- 1079 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
- 1080 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f)
-
- 1082 using ReduceTuple = typename D::Type;
- 1083 #ifdef AMREX_USE_OMP
- 1084 #pragma omp parallel
-
- 1086 for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) {
- 1087 Box const& b = mfi.growntilebox(nghost);
- 1088 const int li = mfi.LocalIndex();
-
-
-
- 1092 for (int n = 0; n < ncomp; ++n) {
- 1093 for (int k = lo.z; k <= hi.z; ++k) {
- 1094 for (int j = lo.y; j <= hi.y; ++j) {
- 1095 for (int i = lo.x; i <= hi.x; ++i) {
-
-
-
-
-
- 1101 template <typename D, typename F>
- 1102 void eval (Box const& box, D & reduce_data, F&& f)
-
-
- 1105 call_f<D>(box, rr, f);
-
-
- 1108 template <typename N, typename D, typename F,
- 1109 typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1110 void eval (Box const& box, N ncomp, D & reduce_data, F&& f)
-
- 1112 using ReduceTuple = typename D::Type;
+ 924 #ifdef AMREX_USE_SYCL
+ 925 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
+ 926 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
+ 927 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+
+ 929 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
+ 930 if (gh.threadIdx() == 0) { *has_any = *dp; }
+
+
+
+
+
+ 936 for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+ 937 icell < ncells && !r; icell += stride) {
+ 938 int k = icell / lenxy;
+ 939 int j = (icell - k*lenxy) / lenx;
+ 940 int i = (icell - k*lenxy) - j*lenx;
+
+
+
+ 944 r = pred(i,j,k) ? 1 : 0;
+
+ 946 r = Gpu::blockReduce<Gpu::Device::warp_size>
+
+ 948 if (gh.threadIdx() == 0 && r) { *dp = 1; }
+
+
+
+
+
+
+ 955 __shared__ int has_any;
+ 956 if (threadIdx.x == 0) has_any = *dp;
+
+
+
+
+
+ 962 for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+ 963 icell < ncells && !r; icell += stride) {
+ 964 int k = icell / lenxy;
+ 965 int j = (icell - k*lenxy) / lenx;
+ 966 int i = (icell - k*lenxy) - j*lenx;
+
+
+
+ 970 r = pred(i,j,k) ? 1 : 0;
+
+ 972 r = Gpu::blockReduce<Gpu::Device::warp_size>
+
+ 974 if (threadIdx.x == 0 && r) *dp = 1;
+
+
+
+
+
+
+
+
+
+
+ 985 template <typename... Ts>
+
+
+
+ 989 using Type = GpuTuple<Ts...>;
+
+ 991 template <typename... Ps>
+ 992 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
+
+ 994 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
+
+ 996 for (auto& t : m_tuple) {
+
+
+
+
+ 1001 ~ReduceData () = default;
+ 1002 ReduceData (ReduceData<Ts...> const&) = delete;
+ 1003 ReduceData (ReduceData<Ts...> &&) = delete;
+ 1004 void operator= (ReduceData<Ts...> const&) = delete;
+ 1005 void operator= (ReduceData<Ts...> &&) = delete;
+
+ 1007 Type value () { return m_fn_value(); }
+
+ 1009 template <typename... Ps>
+ 1010 Type value (ReduceOps<Ps...>& reduce_op)
+
+ 1012 return reduce_op.value(*this);
+
+
+ 1015 Vector<Type>& reference () { return m_tuple; }
+
+ 1017 Type& reference (int tid)
+
+ 1019 if (m_tuple.size() == 1) {
+
+
+
+ 1023 return m_tuple[tid];
+
+
+
+
+ 1028 Vector<Type> m_tuple;
+ 1029 std::function<Type()> m_fn_value;
+
+
+ 1032 template <typename... Ps>
+
+
+
+
+ 1037 template <typename D, typename F>
+
+ 1039 static auto call_f (Box const& box, typename D::Type & r, F const& f)
+ 1040 noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(0,0,0))>,
+ 1041 typename D::Type>::value>
+
+ 1043 using ReduceTuple = typename D::Type;
+
+
+ 1046 for (int k = lo.z; k <= hi.z; ++k) {
+ 1047 for (int j = lo.y; j <= hi.y; ++j) {
+ 1048 for (int i = lo.x; i <= hi.x; ++i) {
+
+
+
+
+ 1053 template <typename D, typename F>
+
+ 1055 static auto call_f (Box const& box, typename D::Type & r, F const& f)
+ 1056 noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(Box()))>,
+ 1057 typename D::Type>::value>
+
+ 1059 using ReduceTuple = typename D::Type;
+
+
+
+
+
+ 1065 template <typename MF, typename D, typename F>
+ 1066 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
+ 1067 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f)
+
+ 1069 using ReduceTuple = typename D::Type;
+ 1070 #ifdef AMREX_USE_OMP
+ 1071 #pragma omp parallel
+
+ 1073 for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) {
+ 1074 Box const& b = mfi.growntilebox(nghost);
+ 1075 const int li = mfi.LocalIndex();
+
+
+
+ 1079 for (int k = lo.z; k <= hi.z; ++k) {
+ 1080 for (int j = lo.y; j <= hi.y; ++j) {
+ 1081 for (int i = lo.x; i <= hi.x; ++i) {
+
+
+
+
+
+ 1087 template <typename MF, typename D, typename F>
+ 1088 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
+ 1089 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f)
+
+ 1091 using ReduceTuple = typename D::Type;
+ 1092 #ifdef AMREX_USE_OMP
+ 1093 #pragma omp parallel
+
+ 1095 for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) {
+ 1096 Box const& b = mfi.growntilebox(nghost);
+ 1097 const int li = mfi.LocalIndex();
+
+
+
+ 1101 for (int n = 0; n < ncomp; ++n) {
+ 1102 for (int k = lo.z; k <= hi.z; ++k) {
+ 1103 for (int j = lo.y; j <= hi.y; ++j) {
+ 1104 for (int i = lo.x; i <= hi.x; ++i) {
+
+
+
+
+
+ 1110 template <typename D, typename F>
+ 1111 void eval (Box const& box, D & reduce_data, F&& f)
+
-
-
- 1116 for (N n = 0; n < ncomp; ++n) {
- 1117 for (int k = lo.z; k <= hi.z; ++k) {
- 1118 for (int j = lo.y; j <= hi.y; ++j) {
- 1119 for (int i = lo.x; i <= hi.x; ++i) {
-
-
-
-
- 1124 template <typename N, typename D, typename F,
- 1125 typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1126 void eval (N n, D & reduce_data, F&& f)
-
- 1128 using ReduceTuple = typename D::Type;
-
- 1130 for (N i = 0; i < n; ++i) {
-
-
-
-
- 1135 template <typename D>
- 1136 typename D::Type value (D & reduce_data)
-
- 1138 using ReduceTuple = typename D::Type;
- 1139 auto& rrv = reduce_data.reference();
- 1140 if (rrv.size() > 1) {
- 1141 for (int i = 1, N = rrv.size(); i < N; ++i) {
-
-
-
-
-
-
-
-
-
- 1151 template <typename T, typename N, typename F,
- 1152 typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1153 T Sum (N n, F&& f, T init_val = 0)
-
-
- 1156 #ifdef AMREX_USE_OMP
- 1157 #pragma omp parallel for reduction(+:r)
-
- 1159 for (N i = 0; i < n; ++i) {
-
-
-
-
+ 1114 call_f<D>(box, rr, f);
+
+
+ 1117 template <typename N, typename D, typename F,
+ 1118 typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1119 void eval (Box const& box, N ncomp, D & reduce_data, F&& f)
+
+ 1121 using ReduceTuple = typename D::Type;
+
+
+
+ 1125 for (N n = 0; n < ncomp; ++n) {
+ 1126 for (int k = lo.z; k <= hi.z; ++k) {
+ 1127 for (int j = lo.y; j <= hi.y; ++j) {
+ 1128 for (int i = lo.x; i <= hi.x; ++i) {
+
+
+
+
+ 1133 template <typename N, typename D, typename F,
+ 1134 typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1135 void eval (N n, D & reduce_data, F&& f)
+
+ 1137 using ReduceTuple = typename D::Type;
+
+ 1139 for (N i = 0; i < n; ++i) {
+
+
+
+
+ 1144 template <typename D>
+ 1145 typename D::Type value (D & reduce_data)
+
+ 1147 auto& rrv = reduce_data.reference();
+ 1148 if (! m_result_is_ready) {
+ 1149 using ReduceTuple = typename D::Type;
+ 1150 if (rrv.size() > 1) {
+ 1151 for (int i = 1, N = rrv.size(); i < N; ++i) {
+
+
+
+ 1155 m_result_is_ready = true;
+
+
+
+
+ 1160 bool m_result_is_ready = false;
+
+
+
- 1165 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1166 T Sum (N n, T const* v, T init_val = 0)
-
- 1168 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
-
-
- 1171 template <typename T, typename N, typename F,
- 1172 typename M=std::enable_if_t<std::is_integral<N>::value> >
-
-
-
- 1176 #ifdef AMREX_USE_OMP
- 1177 #pragma omp parallel for reduction(min:r)
-
- 1179 for (N i = 0; i < n; ++i) {
-
-
-
+ 1165 template <typename T, typename N, typename F,
+ 1166 typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1167 T Sum (N n, F&& f, T init_val = 0)
+
+
+ 1170 #ifdef AMREX_USE_OMP
+ 1171 #pragma omp parallel for reduction(+:r)
+
+ 1173 for (N i = 0; i < n; ++i) {
+
+
+
+
+
+ 1179 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1180 T Sum (N n, T const* v, T init_val = 0)
+
+ 1182 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
- 1185 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
-
- 1188 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
-
-
- 1191 template <typename T, typename N, typename F,
- 1192 typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1193 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
-
-
- 1196 #ifdef AMREX_USE_OMP
- 1197 #pragma omp parallel for reduction(max:r)
-
- 1199 for (N i = 0; i < n; ++i) {
-
-
-
+ 1185 template <typename T, typename N, typename F,
+ 1186 typename M=std::enable_if_t<std::is_integral<N>::value> >
+
+
+
+ 1190 #ifdef AMREX_USE_OMP
+ 1191 #pragma omp parallel for reduction(min:r)
+
+ 1193 for (N i = 0; i < n; ++i) {
+
+
+
+
+
+ 1199 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
+
+ 1202 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
- 1205 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1206 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
-
- 1208 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
-
-
- 1211 template <typename T, typename N, typename F,
- 1212 typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1213 std::pair<T,T> MinMax (N n, F&& f)
-
-
- 1216 T r_max = std::numeric_limits<T>::lowest();
- 1217 #ifdef AMREX_USE_OMP
- 1218 #pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
-
- 1220 for (N i = 0; i < n; ++i) {
-
-
-
-
- 1225 return std::make_pair(r_min,r_max);
-
-
- 1228 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1229 std::pair<T,T> MinMax (N n, T const* v)
-
- 1231 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
-
-
- 1234 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
- 1235 bool AnyOf (N n, T const* v, P&& pred)
-
- 1237 return std::any_of(v, v+n, pred);
-
-
- 1240 template <typename P>
- 1241 bool AnyOf (Box const& box, P&&pred)
-
-
-
- 1245 for (int k = lo.z; k <= hi.z; ++k) {
- 1246 for (int j = lo.y; j <= hi.y; ++j) {
- 1247 for (int i = lo.x; i <= hi.x; ++i) {
- 1248 if (pred(i,j,k)) return true;
-
-
-
-
-
-
-
-
-
-
-
+ 1205 template <typename T, typename N, typename F,
+ 1206 typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1207 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
+
+
+ 1210 #ifdef AMREX_USE_OMP
+ 1211 #pragma omp parallel for reduction(max:r)
+
+ 1213 for (N i = 0; i < n; ++i) {
+
+
+
+
+
+ 1219 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1220 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
+
+ 1222 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
+
+
+ 1225 template <typename T, typename N, typename F,
+ 1226 typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1227 std::pair<T,T> MinMax (N n, F&& f)
+
+
+ 1230 T r_max = std::numeric_limits<T>::lowest();
+ 1231 #ifdef AMREX_USE_OMP
+ 1232 #pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
+
+ 1234 for (N i = 0; i < n; ++i) {
+
+
+
+
+ 1239 return std::make_pair(r_min,r_max);
+
+
+ 1242 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1243 std::pair<T,T> MinMax (N n, T const* v)
+
+ 1245 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
+
+
+ 1248 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
+ 1249 bool AnyOf (N n, T const* v, P&& pred)
+
+ 1251 return std::any_of(v, v+n, pred);
+
+
+ 1254 template <typename P>
+ 1255 bool AnyOf (Box const& box, P&&pred)
+
+
+
+ 1259 for (int k = lo.z; k <= hi.z; ++k) {
+ 1260 for (int j = lo.y; j <= hi.y; ++j) {
+ 1261 for (int i = lo.x; i <= hi.x; ++i) {
+ 1262 if (pred(i,j,k)) return true;
+
+
+
+
+
+
+
+
+
+
+
#define AMREX_FORCE_INLINE
Definition: AMReX_Extension.H:116
#define AMREX_GPU_MAX_STREAMS
Definition: AMReX_GpuDevice.H:19
@@ -1431,11 +1445,11 @@
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void for_each_init(T &t)
Definition: AMReX_Reduce.H:77
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void for_each_parallel(T &d, T const &s)
Definition: AMReX_Reduce.H:38
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void for_each_local(T &d, T const &s)
Definition: AMReX_Reduce.H:55
-T Max(N n, F &&f, T init_val=std::numeric_limits< T >::lowest())
Definition: AMReX_Reduce.H:807
-T Sum(N n, F &&f, T init_val=0)
Definition: AMReX_Reduce.H:761
-bool AnyOf(Box const &box, P &&pred)
Definition: AMReX_Reduce.H:902
-T Min(N n, F &&f, T init_val=std::numeric_limits< T >::max())
Definition: AMReX_Reduce.H:784
-std::pair< T, T > MinMax(N n, F &&f)
Definition: AMReX_Reduce.H:832
+T Max(N n, F &&f, T init_val=std::numeric_limits< T >::lowest())
Definition: AMReX_Reduce.H:816
+T Sum(N n, F &&f, T init_val=0)
Definition: AMReX_Reduce.H:770
+bool AnyOf(Box const &box, P &&pred)
Definition: AMReX_Reduce.H:911
+T Min(N n, F &&f, T init_val=std::numeric_limits< T >::max())
Definition: AMReX_Reduce.H:793
+std::pair< T, T > MinMax(N n, F &&f)
Definition: AMReX_Reduce.H:841
static int f(realtype t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:40
@ min
Definition: AMReX_ParallelReduce.H:18
@ max
Definition: AMReX_ParallelReduce.H:17
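The change to AMReX_Reduce.H above is a result-caching guard: ReduceOps::value() now latches the reduced result the first time it is called (listing lines 747 and 1155 set m_result_is_ready, which lines 752 and 1160 default to false) and returns the cached value on subsequent calls. A minimal sketch of that pattern, with a plain std::vector standing in for the real ReduceData machinery — only the flag name m_result_is_ready comes from the diff; the class, member, and function names here are illustrative:

#include <numeric>
#include <vector>

// Sketch of the caching guard added in the diff: the first value() call
// performs the final reduction and latches m_result_is_ready; later calls
// return the cached result without reducing again.
class CachedSum {
public:
    void eval (double x) { m_partials.push_back(x); m_result_is_ready = false; }

    double value () {
        if (!m_result_is_ready) {
            m_cached = std::accumulate(m_partials.begin(), m_partials.end(), 0.0);
            m_result_is_ready = true;   // mirrors listing lines 747 / 1155
        }
        return m_cached;                // cheap on every later call
    }

private:
    std::vector<double> m_partials;
    double m_cached = 0.0;
    bool m_result_is_ready = false;     // mirrors listing lines 752 / 1160
};

int main () {
    CachedSum r;
    for (int i = 1; i <= 4; ++i) { r.eval(double(i)); }
    double a = r.value();   // reduces: 10
    double b = r.value();   // cached:  10, no second reduction
    return (a == b) ? 0 : 1;
}

This caching is what the member-list, class, and .js documentation pages below pick up as the new private member m_result_is_ready alongside value().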
diff --git a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html
index 098bae81d8..84b74e9aae 100644
--- a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html
+++ b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html
@@ -108,7 +108,8 @@
eval(Box const &box, N ncomp, D &reduce_data, F &&f) | amrex::ReduceOps< Ps > | inline |
eval(N n, D &reduce_data, F &&f) | amrex::ReduceOps< Ps > | inline |
eval_mf(I, MF const &mf, IntVect const &nghost, int ncomp, D &reduce_data, F &&f) | amrex::ReduceOps< Ps > | inline |
- value(D &reduce_data) | amrex::ReduceOps< Ps > | inline |
+ m_result_is_ready | amrex::ReduceOps< Ps > | private |
+ value(D &reduce_data) | amrex::ReduceOps< Ps > | inline |
diff --git a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html
index fa30b2a7a0..f7c59a89db 100644
--- a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html
+++ b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html
@@ -97,6 +97,7 @@
+◆ m_result_is_ready
+
+template<typename... Ps>
+bool amrex::ReduceOps< Ps >::m_result_is_ready = false
+
The documentation for this class was generated from the following file:
diff --git a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.js b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.js
index 85ef56038b..c774286e2c 100644
--- a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.js
+++ b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.js
@@ -6,5 +6,6 @@ var classamrex_1_1ReduceOps =
[ "eval", "classamrex_1_1ReduceOps.html#ab6d0e3293d5341c89aad5b8acc8abd47", null ],
[ "eval", "classamrex_1_1ReduceOps.html#a65284f038ea29beb80b99c1a5f246f04", null ],
[ "eval_mf", "classamrex_1_1ReduceOps.html#ad63e04436117908210c3ddd0e16973d4", null ],
- [ "value", "classamrex_1_1ReduceOps.html#a05d9d8beb406be38812442c318e5d434", null ]
+ [ "value", "classamrex_1_1ReduceOps.html#a05d9d8beb406be38812442c318e5d434", null ],
+ [ "m_result_is_ready", "classamrex_1_1ReduceOps.html#a4e60cd020808a640d7de13b866728202", null ]
];
\ No newline at end of file
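For context on why value() benefits from the cache, a hedged caller-side sketch of the ReduceOps / ReduceData API visible in the listing; box (an amrex::Box) and fab (an amrex::Array4<const amrex::Real>) are assumed to exist in the surrounding AMReX program and are not part of the diff:

#include <AMReX_Reduce.H>

// Inside an AMReX program, after amrex::Initialize():
amrex::ReduceOps<amrex::ReduceOpSum, amrex::ReduceOpMax> reduce_op;
amrex::ReduceData<amrex::Real, amrex::Real> reduce_data(reduce_op);
using ReduceTuple = typename decltype(reduce_data)::Type;

reduce_op.eval(box, reduce_data,
    [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple {
        return { fab(i,j,k), fab(i,j,k) };   // contributes to both sum and max
    });

ReduceTuple hv = reduce_data.value(reduce_op); // first call runs the final reduction
amrex::Real sum = amrex::get<0>(hv);
amrex::Real mx  = amrex::get<1>(hv);
// With the m_result_is_ready guard above, a second reduce_data.value(reduce_op)
// returns the cached tuple instead of repeating the device-to-host reduction.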
diff --git a/amrex/docs_html/doxygen/functions_a.html b/amrex/docs_html/doxygen/functions_a.html
index 651af44f37..df6accc3f7 100644
--- a/amrex/docs_html/doxygen/functions_a.html
+++ b/amrex/docs_html/doxygen/functions_a.html
@@ -118,7 +118,7 @@ - a -