diff --git a/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf b/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf
index 1af6c82b44..918bd5795a 100644
Binary files a/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf and b/amrex/docs_html/_downloads/008eb6dbfab802633dff40122ece848c/amrex.pdf differ
diff --git a/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html b/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html
index b05ba20342..3129c6c6db 100644
--- a/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html
+++ b/amrex/docs_html/doxygen/AMReX__Reduce_8H_source.html
@@ -767,597 +767,611 @@
666  template <typename D>
667  typename D::Type value (D & reduce_data)
668  {
-
669  using ReduceTuple = typename D::Type;
-
670  auto const& stream = Gpu::gpuStream();
-
671  auto hp = reduce_data.hostPtr();
-
672  auto dp = reduce_data.devicePtr();
-
673  auto const& nblocks = reduce_data.nBlocks();
-
674 #if defined(AMREX_USE_SYCL)
-
675  if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
-
676  const int N = nblocks[0];
-
677  if (N == 0) {
-
678  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
-
679  return *hp;
-
680  } else {
-
681  Gpu::PinnedVector<ReduceTuple> tmp(N);
-
682  Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
-
683  Gpu::streamSynchronize();
-
684  for (int i = 1; i < N; ++i) {
-
685  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
-
686  }
-
687  return tmp[0];
-
688  }
-
689  } else
-
690 #endif
-
691  {
-
692  int maxblocks = reduce_data.maxBlocks();
-
693 #ifdef AMREX_USE_SYCL
-
694  // device reduce needs local(i.e., shared) memory
-
695  constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-
696 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
-
697  // xxxxx SYCL todo: reduce bug workaround
-
698  Gpu::DeviceVector<ReduceTuple> dtmp(1);
-
699  auto presult = dtmp.data();
-
700 #else
-
701  auto presult = hp;
-
702 #endif
-
703  amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
-
704  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
-
705  {
-
706  ReduceTuple r;
-
707  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-
708  ReduceTuple dst = r;
-
709  for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
-
710  auto dp_stream = dp+istream*maxblocks;
-
711  for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
-
712  i < nblocks[istream]; i += stride) {
-
713  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
-
714  }
-
715  }
-
716  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-
717  if (gh.threadIdx() == 0) { *presult = dst; }
-
718  });
-
719 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
-
720  Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
-
721 #endif
-
722 #else
-
723  amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
-
724  [=] AMREX_GPU_DEVICE () noexcept
-
725  {
-
726  ReduceTuple r;
-
727  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-
728  ReduceTuple dst = r;
-
729  for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
-
730  auto dp_stream = dp+istream*maxblocks;
-
731  for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
-
732  i < nblocks[istream]; i += stride) {
-
733  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
-
734  }
-
735  }
-
736  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-
737  if (threadIdx.x == 0) { *hp = dst; }
-
738  });
-
739 #endif
-
740  Gpu::streamSynchronize();
-
741  return *hp;
-
742  }
-
743  }
-
744 };
-
745 
-
746 namespace Reduce {
-
747 
-
748 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
749 T Sum (N n, T const* v, T init_val = 0)
-
750 {
-
751  ReduceOps<ReduceOpSum> reduce_op;
-
752  ReduceData<T> reduce_data(reduce_op);
-
753  using ReduceTuple = typename decltype(reduce_data)::Type;
-
754  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
-
755  ReduceTuple hv = reduce_data.value(reduce_op);
-
756  return amrex::get<0>(hv) + init_val;
-
757 }
-
758 
-
759 template <typename T, typename N, typename F,
-
760  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
761 T Sum (N n, F&& f, T init_val = 0)
-
762 {
-
763  ReduceOps<ReduceOpSum> reduce_op;
-
764  ReduceData<T> reduce_data(reduce_op);
-
765  using ReduceTuple = typename decltype(reduce_data)::Type;
-
766  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
-
767  ReduceTuple hv = reduce_data.value(reduce_op);
-
768  return amrex::get<0>(hv) + init_val;
-
769 }
-
770 
-
771 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
772 T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
-
773 {
-
774  ReduceOps<ReduceOpMin> reduce_op;
-
775  ReduceData<T> reduce_data(reduce_op);
-
776  using ReduceTuple = typename decltype(reduce_data)::Type;
-
777  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
-
778  ReduceTuple hv = reduce_data.value(reduce_op);
-
779  return std::min(amrex::get<0>(hv),init_val);
-
780 }
-
781 
-
782 template <typename T, typename N, typename F,
-
783  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
784 T Min (N n, F&& f, T init_val = std::numeric_limits<T>::max())
-
785 {
-
786  ReduceOps<ReduceOpMin> reduce_op;
-
787  ReduceData<T> reduce_data(reduce_op);
-
788  using ReduceTuple = typename decltype(reduce_data)::Type;
-
789  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
-
790  ReduceTuple hv = reduce_data.value(reduce_op);
-
791  return std::min(amrex::get<0>(hv),init_val);
-
792 }
-
793 
-
794 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
795 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
-
796 {
-
797  ReduceOps<ReduceOpMax> reduce_op;
-
798  ReduceData<T> reduce_data(reduce_op);
-
799  using ReduceTuple = typename decltype(reduce_data)::Type;
-
800  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
-
801  ReduceTuple hv = reduce_data.value(reduce_op);
-
802  return std::max(amrex::get<0>(hv),init_val);
-
803 }
-
804 
-
805 template <typename T, typename N, typename F,
-
806  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
807 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
-
808 {
-
809  ReduceOps<ReduceOpMax> reduce_op;
-
810  ReduceData<T> reduce_data(reduce_op);
-
811  using ReduceTuple = typename decltype(reduce_data)::Type;
-
812  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
-
813  ReduceTuple hv = reduce_data.value(reduce_op);
-
814  return std::max(amrex::get<0>(hv),init_val);
-
815 }
-
816 
-
817 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
818 std::pair<T,T> MinMax (N n, T const* v)
-
819 {
-
820  ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
-
821  ReduceData<T,T> reduce_data(reduce_op);
-
822  using ReduceTuple = typename decltype(reduce_data)::Type;
-
823  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
-
824  return {v[i],v[i]};
-
825  });
-
826  auto hv = reduce_data.value(reduce_op);
-
827  return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
-
828 }
-
829 
-
830 template <typename T, typename N, typename F,
-
831  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
832 std::pair<T,T> MinMax (N n, F&& f)
-
833 {
-
834  ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
-
835  ReduceData<T,T> reduce_data(reduce_op);
-
836  using ReduceTuple = typename decltype(reduce_data)::Type;
-
837  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
-
838  T tmp = f(i);
-
839  return {tmp,tmp};
-
840  });
-
841  auto hv = reduce_data.value(reduce_op);
-
842  return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
-
843 }
-
844 
-
845 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
846 bool AnyOf (N n, T const* v, P&& pred)
-
847 {
-
848  Gpu::LaunchSafeGuard lsg(true);
-
849  Gpu::DeviceScalar<int> ds(0);
-
850  int* dp = ds.dataPtr();
-
851  auto ec = Gpu::ExecutionConfig(n);
-
852  ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
+
669  auto hp = reduce_data.hostPtr();
+
670 
+
671  if (m_result_is_ready) {
+
672  return *hp;
+
673  }
+
674 
+
675  using ReduceTuple = typename D::Type;
+
676  auto const& stream = Gpu::gpuStream();
+
677  auto dp = reduce_data.devicePtr();
+
678  auto const& nblocks = reduce_data.nBlocks();
+
679 #if defined(AMREX_USE_SYCL)
+
680  if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
+
681  const int N = nblocks[0];
+
682  if (N == 0) {
+
683  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
+
684  } else {
+
685  Gpu::PinnedVector<ReduceTuple> tmp(N);
+
686  Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
+
687  Gpu::streamSynchronize();
+
688  for (int i = 1; i < N; ++i) {
+
689  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
+
690  }
+
691  *hp = tmp[0];
+
692  }
+
693  } else
+
694 #endif
+
695  {
+
696  int maxblocks = reduce_data.maxBlocks();
+
697 #ifdef AMREX_USE_SYCL
+
698  // device reduce needs local(i.e., shared) memory
+
699  constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
+
700 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+
701  // xxxxx SYCL todo: reduce bug workaround
+
702  Gpu::DeviceVector<ReduceTuple> dtmp(1);
+
703  auto presult = dtmp.data();
+
704 #else
+
705  auto presult = hp;
+
706 #endif
+
707  amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
+
708  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
+
709  {
+
710  ReduceTuple r;
+
711  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
+
712  ReduceTuple dst = r;
+
713  for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
+
714  auto dp_stream = dp+istream*maxblocks;
+
715  for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
+
716  i < nblocks[istream]; i += stride) {
+
717  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
+
718  }
+
719  }
+
720  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
+
721  if (gh.threadIdx() == 0) { *presult = dst; }
+
722  });
+
723 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+
724  Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
+
725 #endif
+
726 #else
+
727  amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
+
728  [=] AMREX_GPU_DEVICE () noexcept
+
729  {
+
730  ReduceTuple r;
+
731  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
+
732  ReduceTuple dst = r;
+
733  for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
+
734  auto dp_stream = dp+istream*maxblocks;
+
735  for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+
736  i < nblocks[istream]; i += stride) {
+
737  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
+
738  }
+
739  }
+
740  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
+
741  if (threadIdx.x == 0) { *hp = dst; }
+
742  });
+
743 #endif
+
744  Gpu::streamSynchronize();
+
745  }
+
746 
+
747  m_result_is_ready = true;
+
748  return *hp;
+
749  }
+
750 
+
751 private:
+
752  bool m_result_is_ready = false;
+
753 };
+
754 
+
755 namespace Reduce {
+
756 
+
757 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
758 T Sum (N n, T const* v, T init_val = 0)
+
759 {
+
760  ReduceOps<ReduceOpSum> reduce_op;
+
761  ReduceData<T> reduce_data(reduce_op);
+
762  using ReduceTuple = typename decltype(reduce_data)::Type;
+
763  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
+
764  ReduceTuple hv = reduce_data.value(reduce_op);
+
765  return amrex::get<0>(hv) + init_val;
+
766 }
+
767 
+
768 template <typename T, typename N, typename F,
+
769  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
770 T Sum (N n, F&& f, T init_val = 0)
+
771 {
+
772  ReduceOps<ReduceOpSum> reduce_op;
+
773  ReduceData<T> reduce_data(reduce_op);
+
774  using ReduceTuple = typename decltype(reduce_data)::Type;
+
775  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
+
776  ReduceTuple hv = reduce_data.value(reduce_op);
+
777  return amrex::get<0>(hv) + init_val;
+
778 }
+
779 
+
780 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
781 T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
+
782 {
+
783  ReduceOps<ReduceOpMin> reduce_op;
+
784  ReduceData<T> reduce_data(reduce_op);
+
785  using ReduceTuple = typename decltype(reduce_data)::Type;
+
786  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
+
787  ReduceTuple hv = reduce_data.value(reduce_op);
+
788  return std::min(amrex::get<0>(hv),init_val);
+
789 }
+
790 
+
791 template <typename T, typename N, typename F,
+
792  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
793 T Min (N n, F&& f, T init_val = std::numeric_limits<T>::max())
+
794 {
+
795  ReduceOps<ReduceOpMin> reduce_op;
+
796  ReduceData<T> reduce_data(reduce_op);
+
797  using ReduceTuple = typename decltype(reduce_data)::Type;
+
798  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
+
799  ReduceTuple hv = reduce_data.value(reduce_op);
+
800  return std::min(amrex::get<0>(hv),init_val);
+
801 }
+
802 
+
803 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
804 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
+
805 {
+
806  ReduceOps<ReduceOpMax> reduce_op;
+
807  ReduceData<T> reduce_data(reduce_op);
+
808  using ReduceTuple = typename decltype(reduce_data)::Type;
+
809  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
+
810  ReduceTuple hv = reduce_data.value(reduce_op);
+
811  return std::max(amrex::get<0>(hv),init_val);
+
812 }
+
813 
+
814 template <typename T, typename N, typename F,
+
815  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
816 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
+
817 {
+
818  ReduceOps<ReduceOpMax> reduce_op;
+
819  ReduceData<T> reduce_data(reduce_op);
+
820  using ReduceTuple = typename decltype(reduce_data)::Type;
+
821  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
+
822  ReduceTuple hv = reduce_data.value(reduce_op);
+
823  return std::max(amrex::get<0>(hv),init_val);
+
824 }
+
825 
+
826 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
827 std::pair<T,T> MinMax (N n, T const* v)
+
828 {
+
829  ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
+
830  ReduceData<T,T> reduce_data(reduce_op);
+
831  using ReduceTuple = typename decltype(reduce_data)::Type;
+
832  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
+
833  return {v[i],v[i]};
+
834  });
+
835  auto hv = reduce_data.value(reduce_op);
+
836  return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
+
837 }
+
838 
+
839 template <typename T, typename N, typename F,
+
840  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
841 std::pair<T,T> MinMax (N n, F&& f)
+
842 {
+
843  ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
+
844  ReduceData<T,T> reduce_data(reduce_op);
+
845  using ReduceTuple = typename decltype(reduce_data)::Type;
+
846  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
+
847  T tmp = f(i);
+
848  return {tmp,tmp};
+
849  });
+
850  auto hv = reduce_data.value(reduce_op);
+
851  return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
+
852 }
853 
-
854 #ifdef AMREX_USE_SYCL
-
855  const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
-
856  const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-
857  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-
858  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
-
859  int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
-
860  if (gh.threadIdx() == 0) { *has_any = *dp; }
-
861  gh.sharedBarrier();
+
854 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
855 bool AnyOf (N n, T const* v, P&& pred)
+
856 {
+
857  Gpu::LaunchSafeGuard lsg(true);
+
858  Gpu::DeviceScalar<int> ds(0);
+
859  int* dp = ds.dataPtr();
+
860  auto ec = Gpu::ExecutionConfig(n);
+
861  ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
862 
-
863  if (!(*has_any))
-
864  {
-
865  int r = false;
-
866  for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
-
867  i < n && !r; i += stride)
-
868  {
-
869  r = pred(v[i]) ? 1 : 0;
-
870  }
+
863 #ifdef AMREX_USE_SYCL
+
864  const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
+
865  const std::size_t shared_mem_bytes = num_ints*sizeof(int);
+
866  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+
867  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
+
868  int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
+
869  if (gh.threadIdx() == 0) { *has_any = *dp; }
+
870  gh.sharedBarrier();
871 
-
872  r = Gpu::blockReduce<Gpu::Device::warp_size>
-
873  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
-
874  if (gh.threadIdx() == 0 && r) { *dp = 1; }
-
875  }
-
876  });
-
877 #else
-
878  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
-
879  [=] AMREX_GPU_DEVICE () noexcept {
-
880  __shared__ int has_any;
-
881  if (threadIdx.x == 0) has_any = *dp;
-
882  __syncthreads();
-
883 
-
884  if (!has_any)
-
885  {
-
886  int r = false;
-
887  for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
-
888  i < n && !r; i += stride)
-
889  {
-
890  r = pred(v[i]) ? 1 : 0;
-
891  }
-
892  r = Gpu::blockReduce<Gpu::Device::warp_size>
-
893  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
-
894  if (threadIdx.x == 0 && r) *dp = 1;
-
895  }
-
896  });
-
897 #endif
-
898  return ds.dataValue();
-
899 }
-
900 
-
901 template <typename P>
-
902 bool AnyOf (Box const& box, P&& pred)
-
903 {
-
904  Gpu::LaunchSafeGuard lsg(true);
-
905  Gpu::DeviceScalar<int> ds(0);
-
906  int* dp = ds.dataPtr();
-
907  int ncells = box.numPts();
-
908  const auto lo = amrex::lbound(box);
-
909  const auto len = amrex::length(box);
-
910  const auto lenxy = len.x*len.y;
-
911  const auto lenx = len.x;
-
912  auto ec = Gpu::ExecutionConfig(ncells);
-
913  ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
-
914 
-
915 #ifdef AMREX_USE_SYCL
-
916  const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
-
917  const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-
918  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-
919  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
-
920  int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
-
921  if (gh.threadIdx() == 0) { *has_any = *dp; }
-
922  gh.sharedBarrier();
+
872  if (!(*has_any))
+
873  {
+
874  int r = false;
+
875  for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+
876  i < n && !r; i += stride)
+
877  {
+
878  r = pred(v[i]) ? 1 : 0;
+
879  }
+
880 
+
881  r = Gpu::blockReduce<Gpu::Device::warp_size>
+
882  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
+
883  if (gh.threadIdx() == 0 && r) { *dp = 1; }
+
884  }
+
885  });
+
886 #else
+
887  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
+
888  [=] AMREX_GPU_DEVICE () noexcept {
+
889  __shared__ int has_any;
+
890  if (threadIdx.x == 0) has_any = *dp;
+
891  __syncthreads();
+
892 
+
893  if (!has_any)
+
894  {
+
895  int r = false;
+
896  for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+
897  i < n && !r; i += stride)
+
898  {
+
899  r = pred(v[i]) ? 1 : 0;
+
900  }
+
901  r = Gpu::blockReduce<Gpu::Device::warp_size>
+
902  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
+
903  if (threadIdx.x == 0 && r) *dp = 1;
+
904  }
+
905  });
+
906 #endif
+
907  return ds.dataValue();
+
908 }
+
909 
+
910 template <typename P>
+
911 bool AnyOf (Box const& box, P&& pred)
+
912 {
+
913  Gpu::LaunchSafeGuard lsg(true);
+
914  Gpu::DeviceScalar<int> ds(0);
+
915  int* dp = ds.dataPtr();
+
916  int ncells = box.numPts();
+
917  const auto lo = amrex::lbound(box);
+
918  const auto len = amrex::length(box);
+
919  const auto lenxy = len.x*len.y;
+
920  const auto lenx = len.x;
+
921  auto ec = Gpu::ExecutionConfig(ncells);
+
922  ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
923 
-
924  if (!(*has_any))
-
925  {
-
926  int r = false;
-
927  for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
-
928  icell < ncells && !r; icell += stride) {
-
929  int k = icell / lenxy;
-
930  int j = (icell - k*lenxy) / lenx;
-
931  int i = (icell - k*lenxy) - j*lenx;
-
932  i += lo.x;
-
933  j += lo.y;
-
934  k += lo.z;
-
935  r = pred(i,j,k) ? 1 : 0;
-
936  }
-
937  r = Gpu::blockReduce<Gpu::Device::warp_size>
-
938  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
-
939  if (gh.threadIdx() == 0 && r) { *dp = 1; }
-
940  }
-
941  });
-
942 #else
-
943  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
-
944  Gpu::gpuStream(),
-
945  [=] AMREX_GPU_DEVICE () noexcept {
-
946  __shared__ int has_any;
-
947  if (threadIdx.x == 0) has_any = *dp;
-
948  __syncthreads();
-
949 
-
950  if (!has_any)
-
951  {
-
952  int r = false;
-
953  for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
-
954  icell < ncells && !r; icell += stride) {
-
955  int k = icell / lenxy;
-
956  int j = (icell - k*lenxy) / lenx;
-
957  int i = (icell - k*lenxy) - j*lenx;
-
958  i += lo.x;
-
959  j += lo.y;
-
960  k += lo.z;
-
961  r = pred(i,j,k) ? 1 : 0;
-
962  }
-
963  r = Gpu::blockReduce<Gpu::Device::warp_size>
-
964  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
-
965  if (threadIdx.x == 0 && r) *dp = 1;
-
966  }
-
967  });
-
968 #endif
-
969  return ds.dataValue();
-
970 }
-
971 
-
972 }
-
973 
-
974 #else
-
975 
-
976 template <typename... Ts>
-
977 class ReduceData
-
978 {
-
979 public:
-
980  using Type = GpuTuple<Ts...>;
-
981 
-
982  template <typename... Ps>
-
983  explicit ReduceData (ReduceOps<Ps...>& reduce_op)
-
984  : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
-
985  m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
-
986  {
-
987  for (auto& t : m_tuple) {
-
988  Reduce::detail::for_each_init<0, Type, Ps...>(t);
-
989  }
-
990  }
-
991 
-
992  ~ReduceData () = default;
-
993  ReduceData (ReduceData<Ts...> const&) = delete;
-
994  ReduceData (ReduceData<Ts...> &&) = delete;
-
995  void operator= (ReduceData<Ts...> const&) = delete;
-
996  void operator= (ReduceData<Ts...> &&) = delete;
-
997 
-
998  Type value () { return m_fn_value(); }
-
999 
-
1000  template <typename... Ps>
-
1001  Type value (ReduceOps<Ps...>& reduce_op)
-
1002  {
-
1003  return reduce_op.value(*this);
-
1004  }
-
1005 
-
1006  Vector<Type>& reference () { return m_tuple; }
-
1007 
-
1008  Type& reference (int tid)
-
1009  {
-
1010  if (m_tuple.size() == 1) {
-
1011  // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
-
1012  return m_tuple[0];
-
1013  } else {
-
1014  return m_tuple[tid];
-
1015  }
-
1016  }
-
1017 
-
1018 private:
-
1019  Vector<Type> m_tuple;
-
1020  std::function<Type()> m_fn_value;
-
1021 };
-
1022 
-
1023 template <typename... Ps>
-
1024 class ReduceOps
-
1025 {
-
1026 private:
-
1027 
-
1028  template <typename D, typename F>
-
1029  AMREX_FORCE_INLINE
-
1030  static auto call_f (Box const& box, typename D::Type & r, F const& f)
-
1031  noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(0,0,0))>,
-
1032  typename D::Type>::value>
-
1033  {
-
1034  using ReduceTuple = typename D::Type;
-
1035  const auto lo = amrex::lbound(box);
-
1036  const auto hi = amrex::ubound(box);
-
1037  for (int k = lo.z; k <= hi.z; ++k) {
-
1038  for (int j = lo.y; j <= hi.y; ++j) {
-
1039  for (int i = lo.x; i <= hi.x; ++i) {
-
1040  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
-
1041  }}}
-
1042  }
-
1043 
-
1044  template <typename D, typename F>
-
1045  AMREX_FORCE_INLINE
-
1046  static auto call_f (Box const& box, typename D::Type & r, F const& f)
-
1047  noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(Box()))>,
-
1048  typename D::Type>::value>
-
1049  {
-
1050  using ReduceTuple = typename D::Type;
-
1051  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
-
1052  }
-
1053 
-
1054 public:
-
1055 
-
1056  template <typename MF, typename D, typename F>
-
1057  std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
-
1058  eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f)
-
1059  {
-
1060  using ReduceTuple = typename D::Type;
-
1061 #ifdef AMREX_USE_OMP
-
1062 #pragma omp parallel
-
1063 #endif
-
1064  for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
-
1065  Box const& b = mfi.growntilebox(nghost);
-
1066  const int li = mfi.LocalIndex();
-
1067  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
-
1068  const auto lo = amrex::lbound(b);
-
1069  const auto hi = amrex::ubound(b);
-
1070  for (int k = lo.z; k <= hi.z; ++k) {
-
1071  for (int j = lo.y; j <= hi.y; ++j) {
-
1072  for (int i = lo.x; i <= hi.x; ++i) {
-
1073  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
-
1074  }}}
-
1075  }
-
1076  }
-
1077 
-
1078  template <typename MF, typename D, typename F>
-
1079  std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
-
1080  eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f)
-
1081  {
-
1082  using ReduceTuple = typename D::Type;
-
1083 #ifdef AMREX_USE_OMP
-
1084 #pragma omp parallel
-
1085 #endif
-
1086  for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
-
1087  Box const& b = mfi.growntilebox(nghost);
-
1088  const int li = mfi.LocalIndex();
-
1089  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
-
1090  const auto lo = amrex::lbound(b);
-
1091  const auto hi = amrex::ubound(b);
-
1092  for (int n = 0; n < ncomp; ++n) {
-
1093  for (int k = lo.z; k <= hi.z; ++k) {
-
1094  for (int j = lo.y; j <= hi.y; ++j) {
-
1095  for (int i = lo.x; i <= hi.x; ++i) {
-
1096  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
-
1097  }}}}
-
1098  }
-
1099  }
-
1100 
-
1101  template <typename D, typename F>
-
1102  void eval (Box const& box, D & reduce_data, F&& f)
-
1103  {
-
1104  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
-
1105  call_f<D>(box, rr, f);
-
1106  }
-
1107 
-
1108  template <typename N, typename D, typename F,
-
1109  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1110  void eval (Box const& box, N ncomp, D & reduce_data, F&& f)
-
1111  {
-
1112  using ReduceTuple = typename D::Type;
+
924 #ifdef AMREX_USE_SYCL
+
925  const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
+
926  const std::size_t shared_mem_bytes = num_ints*sizeof(int);
+
927  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+
928  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
+
929  int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
+
930  if (gh.threadIdx() == 0) { *has_any = *dp; }
+
931  gh.sharedBarrier();
+
932 
+
933  if (!(*has_any))
+
934  {
+
935  int r = false;
+
936  for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+
937  icell < ncells && !r; icell += stride) {
+
938  int k = icell / lenxy;
+
939  int j = (icell - k*lenxy) / lenx;
+
940  int i = (icell - k*lenxy) - j*lenx;
+
941  i += lo.x;
+
942  j += lo.y;
+
943  k += lo.z;
+
944  r = pred(i,j,k) ? 1 : 0;
+
945  }
+
946  r = Gpu::blockReduce<Gpu::Device::warp_size>
+
947  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
+
948  if (gh.threadIdx() == 0 && r) { *dp = 1; }
+
949  }
+
950  });
+
951 #else
+
952  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
+
953  Gpu::gpuStream(),
+
954  [=] AMREX_GPU_DEVICE () noexcept {
+
955  __shared__ int has_any;
+
956  if (threadIdx.x == 0) has_any = *dp;
+
957  __syncthreads();
+
958 
+
959  if (!has_any)
+
960  {
+
961  int r = false;
+
962  for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+
963  icell < ncells && !r; icell += stride) {
+
964  int k = icell / lenxy;
+
965  int j = (icell - k*lenxy) / lenx;
+
966  int i = (icell - k*lenxy) - j*lenx;
+
967  i += lo.x;
+
968  j += lo.y;
+
969  k += lo.z;
+
970  r = pred(i,j,k) ? 1 : 0;
+
971  }
+
972  r = Gpu::blockReduce<Gpu::Device::warp_size>
+
973  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
+
974  if (threadIdx.x == 0 && r) *dp = 1;
+
975  }
+
976  });
+
977 #endif
+
978  return ds.dataValue();
+
979 }
+
980 
+
981 }
+
982 
+
983 #else
+
984 
+
985 template <typename... Ts>
+
986 class ReduceData
+
987 {
+
988 public:
+
989  using Type = GpuTuple<Ts...>;
+
990 
+
991  template <typename... Ps>
+
992  explicit ReduceData (ReduceOps<Ps...>& reduce_op)
+
993  : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
+
994  m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
+
995  {
+
996  for (auto& t : m_tuple) {
+
997  Reduce::detail::for_each_init<0, Type, Ps...>(t);
+
998  }
+
999  }
+
1000 
+
1001  ~ReduceData () = default;
+
1002  ReduceData (ReduceData<Ts...> const&) = delete;
+
1003  ReduceData (ReduceData<Ts...> &&) = delete;
+
1004  void operator= (ReduceData<Ts...> const&) = delete;
+
1005  void operator= (ReduceData<Ts...> &&) = delete;
+
1006 
+
1007  Type value () { return m_fn_value(); }
+
1008 
+
1009  template <typename... Ps>
+
1010  Type value (ReduceOps<Ps...>& reduce_op)
+
1011  {
+
1012  return reduce_op.value(*this);
+
1013  }
+
1014 
+
1015  Vector<Type>& reference () { return m_tuple; }
+
1016 
+
1017  Type& reference (int tid)
+
1018  {
+
1019  if (m_tuple.size() == 1) {
+
1020  // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
+
1021  return m_tuple[0];
+
1022  } else {
+
1023  return m_tuple[tid];
+
1024  }
+
1025  }
+
1026 
+
1027 private:
+
1028  Vector<Type> m_tuple;
+
1029  std::function<Type()> m_fn_value;
+
1030 };
+
1031 
+
1032 template <typename... Ps>
+
1033 class ReduceOps
+
1034 {
+
1035 private:
+
1036 
+
1037  template <typename D, typename F>
+
1038  AMREX_FORCE_INLINE
+
1039  static auto call_f (Box const& box, typename D::Type & r, F const& f)
+
1040  noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(0,0,0))>,
+
1041  typename D::Type>::value>
+
1042  {
+
1043  using ReduceTuple = typename D::Type;
+
1044  const auto lo = amrex::lbound(box);
+
1045  const auto hi = amrex::ubound(box);
+
1046  for (int k = lo.z; k <= hi.z; ++k) {
+
1047  for (int j = lo.y; j <= hi.y; ++j) {
+
1048  for (int i = lo.x; i <= hi.x; ++i) {
+
1049  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
+
1050  }}}
+
1051  }
+
1052 
+
1053  template <typename D, typename F>
+
1054  AMREX_FORCE_INLINE
+
1055  static auto call_f (Box const& box, typename D::Type & r, F const& f)
+
1056  noexcept -> std::enable_if_t<std::is_same<std::decay_t<decltype(f(Box()))>,
+
1057  typename D::Type>::value>
+
1058  {
+
1059  using ReduceTuple = typename D::Type;
+
1060  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
+
1061  }
+
1062 
+
1063 public:
+
1064 
+
1065  template <typename MF, typename D, typename F>
+
1066  std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
+
1067  eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f)
+
1068  {
+
1069  using ReduceTuple = typename D::Type;
+
1070 #ifdef AMREX_USE_OMP
+
1071 #pragma omp parallel
+
1072 #endif
+
1073  for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
+
1074  Box const& b = mfi.growntilebox(nghost);
+
1075  const int li = mfi.LocalIndex();
+
1076  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
+
1077  const auto lo = amrex::lbound(b);
+
1078  const auto hi = amrex::ubound(b);
+
1079  for (int k = lo.z; k <= hi.z; ++k) {
+
1080  for (int j = lo.y; j <= hi.y; ++j) {
+
1081  for (int i = lo.x; i <= hi.x; ++i) {
+
1082  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
+
1083  }}}
+
1084  }
+
1085  }
+
1086 
+
1087  template <typename MF, typename D, typename F>
+
1088  std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
+
1089  eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f)
+
1090  {
+
1091  using ReduceTuple = typename D::Type;
+
1092 #ifdef AMREX_USE_OMP
+
1093 #pragma omp parallel
+
1094 #endif
+
1095  for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
+
1096  Box const& b = mfi.growntilebox(nghost);
+
1097  const int li = mfi.LocalIndex();
+
1098  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
+
1099  const auto lo = amrex::lbound(b);
+
1100  const auto hi = amrex::ubound(b);
+
1101  for (int n = 0; n < ncomp; ++n) {
+
1102  for (int k = lo.z; k <= hi.z; ++k) {
+
1103  for (int j = lo.y; j <= hi.y; ++j) {
+
1104  for (int i = lo.x; i <= hi.x; ++i) {
+
1105  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
+
1106  }}}}
+
1107  }
+
1108  }
+
1109 
+
1110  template <typename D, typename F>
+
1111  void eval (Box const& box, D & reduce_data, F&& f)
+
1112  {
1113  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
-
1114  const auto lo = amrex::lbound(box);
-
1115  const auto hi = amrex::ubound(box);
-
1116  for (N n = 0; n < ncomp; ++n) {
-
1117  for (int k = lo.z; k <= hi.z; ++k) {
-
1118  for (int j = lo.y; j <= hi.y; ++j) {
-
1119  for (int i = lo.x; i <= hi.x; ++i) {
-
1120  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
-
1121  }}}}
-
1122  }
-
1123 
-
1124  template <typename N, typename D, typename F,
-
1125  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1126  void eval (N n, D & reduce_data, F&& f)
-
1127  {
-
1128  using ReduceTuple = typename D::Type;
-
1129  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
-
1130  for (N i = 0; i < n; ++i) {
-
1131  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
-
1132  }
-
1133  }
-
1134 
-
1135  template <typename D>
-
1136  typename D::Type value (D & reduce_data)
-
1137  {
-
1138  using ReduceTuple = typename D::Type;
-
1139  auto& rrv = reduce_data.reference();
-
1140  if (rrv.size() > 1) {
-
1141  for (int i = 1, N = rrv.size(); i < N; ++i) {
-
1142  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
-
1143  }
-
1144  }
-
1145  return rrv[0];
-
1146  }
-
1147 };
-
1148 
-
1149 namespace Reduce {
-
1150 
-
1151 template <typename T, typename N, typename F,
-
1152  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1153 T Sum (N n, F&& f, T init_val = 0)
-
1154 {
-
1155  T r = init_val;
-
1156 #ifdef AMREX_USE_OMP
-
1157 #pragma omp parallel for reduction(+:r)
-
1158 #endif
-
1159  for (N i = 0; i < n; ++i) {
-
1160  r += f(i);
-
1161  }
-
1162  return r;
-
1163 }
+
1114  call_f<D>(box, rr, f);
+
1115  }
+
1116 
+
1117  template <typename N, typename D, typename F,
+
1118  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1119  void eval (Box const& box, N ncomp, D & reduce_data, F&& f)
+
1120  {
+
1121  using ReduceTuple = typename D::Type;
+
1122  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
+
1123  const auto lo = amrex::lbound(box);
+
1124  const auto hi = amrex::ubound(box);
+
1125  for (N n = 0; n < ncomp; ++n) {
+
1126  for (int k = lo.z; k <= hi.z; ++k) {
+
1127  for (int j = lo.y; j <= hi.y; ++j) {
+
1128  for (int i = lo.x; i <= hi.x; ++i) {
+
1129  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
+
1130  }}}}
+
1131  }
+
1132 
+
1133  template <typename N, typename D, typename F,
+
1134  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1135  void eval (N n, D & reduce_data, F&& f)
+
1136  {
+
1137  using ReduceTuple = typename D::Type;
+
1138  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
+
1139  for (N i = 0; i < n; ++i) {
+
1140  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
+
1141  }
+
1142  }
+
1143 
+
1144  template <typename D>
+
1145  typename D::Type value (D & reduce_data)
+
1146  {
+
1147  auto& rrv = reduce_data.reference();
+
1148  if (! m_result_is_ready) {
+
1149  using ReduceTuple = typename D::Type;
+
1150  if (rrv.size() > 1) {
+
1151  for (int i = 1, N = rrv.size(); i < N; ++i) {
+
1152  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
+
1153  }
+
1154  }
+
1155  m_result_is_ready = true;
+
1156  }
+
1157  return rrv[0];
+
1158  }
+
1159 
+
1160  bool m_result_is_ready = false;
+
1161 };
+
1162 
+
1163 namespace Reduce {
1164 
-
1165 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1166 T Sum (N n, T const* v, T init_val = 0)
-
1167 {
-
1168  return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
-
1169 }
-
1170 
-
1171 template <typename T, typename N, typename F,
-
1172  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1173 T Min (N n, F&& f, T init_val = std::numeric_limits<T>::max())
-
1174 {
-
1175  T r = init_val;
-
1176 #ifdef AMREX_USE_OMP
-
1177 #pragma omp parallel for reduction(min:r)
-
1178 #endif
-
1179  for (N i = 0; i < n; ++i) {
-
1180  r = std::min(r,f(i));
-
1181  }
-
1182  return r;
+
1165 template <typename T, typename N, typename F,
+
1166  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1167 T Sum (N n, F&& f, T init_val = 0)
+
1168 {
+
1169  T r = init_val;
+
1170 #ifdef AMREX_USE_OMP
+
1171 #pragma omp parallel for reduction(+:r)
+
1172 #endif
+
1173  for (N i = 0; i < n; ++i) {
+
1174  r += f(i);
+
1175  }
+
1176  return r;
+
1177 }
+
1178 
+
1179 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1180 T Sum (N n, T const* v, T init_val = 0)
+
1181 {
+
1182  return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1183 }
1184 
-
1185 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1186 T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
-
1187 {
-
1188  return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
-
1189 }
-
1190 
-
1191 template <typename T, typename N, typename F,
-
1192  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1193 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
-
1194 {
-
1195  T r = init_val;
-
1196 #ifdef AMREX_USE_OMP
-
1197 #pragma omp parallel for reduction(max:r)
-
1198 #endif
-
1199  for (N i = 0; i < n; ++i) {
-
1200  r = std::max(r,f(i));
-
1201  }
-
1202  return r;
+
1185 template <typename T, typename N, typename F,
+
1186  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1187 T Min (N n, F&& f, T init_val = std::numeric_limits<T>::max())
+
1188 {
+
1189  T r = init_val;
+
1190 #ifdef AMREX_USE_OMP
+
1191 #pragma omp parallel for reduction(min:r)
+
1192 #endif
+
1193  for (N i = 0; i < n; ++i) {
+
1194  r = std::min(r,f(i));
+
1195  }
+
1196  return r;
+
1197 }
+
1198 
+
1199 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1200 T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
+
1201 {
+
1202  return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1203 }
1204 
-
1205 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1206 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
-
1207 {
-
1208  return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
-
1209 }
-
1210 
-
1211 template <typename T, typename N, typename F,
-
1212  typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1213 std::pair<T,T> Min (N n, F&& f)
-
1214 {
-
1215  T r_min = std::numeric_limits<T>::max();
-
1216  T r_max = std::numeric_limits<T>::lowest();
-
1217 #ifdef AMREX_USE_OMP
-
1218 #pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
-
1219 #endif
-
1220  for (N i = 0; i < n; ++i) {
-
1221  T tmp = f(i);
-
1222  r_min = std::min(r_min,tmp);
-
1223  r_max = std::max(r_max,tmp);
-
1224  }
-
1225  return std::make_pair(r_min,r_max);
-
1226 }
-
1227 
-
1228 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1229 std::pair<T,T> MinMax (N n, T const* v)
-
1230 {
-
1231  return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
-
1232 }
-
1233 
-
1234 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
-
1235 bool AnyOf (N n, T const* v, P&& pred)
-
1236 {
-
1237  return std::any_of(v, v+n, pred);
-
1238 }
-
1239 
-
1240 template <typename P>
-
1241 bool AnyOf (Box const& box, P&&pred)
-
1242 {
-
1243  const auto lo = amrex::lbound(box);
-
1244  const auto hi = amrex::ubound(box);
-
1245  for (int k = lo.z; k <= hi.z; ++k) {
-
1246  for (int j = lo.y; j <= hi.y; ++j) {
-
1247  for (int i = lo.x; i <= hi.x; ++i) {
-
1248  if (pred(i,j,k)) return true;
-
1249  }}}
-
1250  return false;
-
1251 }
-
1252 
-
1253 }
-
1254 
-
1255 #endif
-
1256 
-
1257 }
-
1258 
-
1259 #endif
+
1205 template <typename T, typename N, typename F,
+
1206  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1207 T Max (N n, F&& f, T init_val = std::numeric_limits<T>::lowest())
+
1208 {
+
1209  T r = init_val;
+
1210 #ifdef AMREX_USE_OMP
+
1211 #pragma omp parallel for reduction(max:r)
+
1212 #endif
+
1213  for (N i = 0; i < n; ++i) {
+
1214  r = std::max(r,f(i));
+
1215  }
+
1216  return r;
+
1217 }
+
1218 
+
1219 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1220 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
+
1221 {
+
1222  return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
+
1223 }
+
1224 
+
1225 template <typename T, typename N, typename F,
+
1226  typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1227 std::pair<T,T> Min (N n, F&& f)
+
1228 {
+
1229  T r_min = std::numeric_limits<T>::max();
+
1230  T r_max = std::numeric_limits<T>::lowest();
+
1231 #ifdef AMREX_USE_OMP
+
1232 #pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
+
1233 #endif
+
1234  for (N i = 0; i < n; ++i) {
+
1235  T tmp = f(i);
+
1236  r_min = std::min(r_min,tmp);
+
1237  r_max = std::max(r_max,tmp);
+
1238  }
+
1239  return std::make_pair(r_min,r_max);
+
1240 }
+
1241 
+
1242 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1243 std::pair<T,T> MinMax (N n, T const* v)
+
1244 {
+
1245  return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
+
1246 }
+
1247 
+
1248 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
+
1249 bool AnyOf (N n, T const* v, P&& pred)
+
1250 {
+
1251  return std::any_of(v, v+n, pred);
+
1252 }
+
1253 
+
1254 template <typename P>
+
1255 bool AnyOf (Box const& box, P&&pred)
+
1256 {
+
1257  const auto lo = amrex::lbound(box);
+
1258  const auto hi = amrex::ubound(box);
+
1259  for (int k = lo.z; k <= hi.z; ++k) {
+
1260  for (int j = lo.y; j <= hi.y; ++j) {
+
1261  for (int i = lo.x; i <= hi.x; ++i) {
+
1262  if (pred(i,j,k)) return true;
+
1263  }}}
+
1264  return false;
+
1265 }
+
1266 
+
1267 }
+
1268 
+
1269 #endif
+
1270 
+
1271 }
+
1272 
+
1273 #endif
AMReX_Arena.H
AMREX_FORCE_INLINE
#define AMREX_FORCE_INLINE
Definition: AMReX_Extension.H:116
AMREX_GPU_MAX_STREAMS
#define AMREX_GPU_MAX_STREAMS
Definition: AMReX_GpuDevice.H:19
@@ -1431,11 +1445,11 @@
amrex::Reduce::detail::for_each_init
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void for_each_init(T &t)
Definition: AMReX_Reduce.H:77
amrex::Reduce::detail::for_each_parallel
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void for_each_parallel(T &d, T const &s)
Definition: AMReX_Reduce.H:38
amrex::Reduce::detail::for_each_local
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void for_each_local(T &d, T const &s)
Definition: AMReX_Reduce.H:55
-
amrex::Reduce::Max
T Max(N n, F &&f, T init_val=std::numeric_limits< T >::lowest())
Definition: AMReX_Reduce.H:807
-
amrex::Reduce::Sum
T Sum(N n, F &&f, T init_val=0)
Definition: AMReX_Reduce.H:761
-
amrex::Reduce::AnyOf
bool AnyOf(Box const &box, P &&pred)
Definition: AMReX_Reduce.H:902
-
amrex::Reduce::Min
T Min(N n, F &&f, T init_val=std::numeric_limits< T >::max())
Definition: AMReX_Reduce.H:784
-
amrex::Reduce::MinMax
std::pair< T, T > MinMax(N n, F &&f)
Definition: AMReX_Reduce.H:832
+
amrex::Reduce::Max
T Max(N n, F &&f, T init_val=std::numeric_limits< T >::lowest())
Definition: AMReX_Reduce.H:816
+
amrex::Reduce::Sum
T Sum(N n, F &&f, T init_val=0)
Definition: AMReX_Reduce.H:770
+
amrex::Reduce::AnyOf
bool AnyOf(Box const &box, P &&pred)
Definition: AMReX_Reduce.H:911
+
amrex::Reduce::Min
T Min(N n, F &&f, T init_val=std::numeric_limits< T >::max())
Definition: AMReX_Reduce.H:793
+
amrex::Reduce::MinMax
std::pair< T, T > MinMax(N n, F &&f)
Definition: AMReX_Reduce.H:841
amrex::SundialsUserFun::f
static int f(realtype t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:40
amrex::detail::min
@ min
Definition: AMReX_ParallelReduce.H:18
amrex::detail::max
@ max
Definition: AMReX_ParallelReduce.H:17
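
The hunks above change ReduceOps::value(): the final reduction result is now written to the host pointer once, and a new private flag, m_result_is_ready, lets later calls return that cached value without launching another kernel. The following is a minimal calling sketch, not code from the repository; it assumes a GPU-enabled AMReX build, and the function name sum_of_squares and the pointer v are illustrative only.

    #include <AMReX.H>
    #include <AMReX_Reduce.H>

    // Illustrative sketch (not library code): sum of v[i]*v[i] over n values,
    // where v is assumed to point to device-accessible memory.
    amrex::Real sum_of_squares (int n, amrex::Real const* v)
    {
        using namespace amrex;
        ReduceOps<ReduceOpSum> reduce_op;
        ReduceData<Real> reduce_data(reduce_op);
        using ReduceTuple = typename decltype(reduce_data)::Type;

        reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple {
            return {v[i]*v[i]};
        });

        ReduceTuple hv  = reduce_data.value(reduce_op); // performs the final reduction
        ReduceTuple hv2 = reduce_data.value(reduce_op); // after this patch: cached result, no relaunch
        (void) hv2;
        return amrex::get<0>(hv);
    }

With this patch, calling value() repeatedly is cheap; previously each call repeated the final device-to-host reduction.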
diff --git a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html
index 098bae81d8..84b74e9aae 100644
--- a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html
+++ b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps-members.html
@@ -108,7 +108,8 @@
eval(Box const &box, N ncomp, D &reduce_data, F &&f)amrex::ReduceOps< Ps >inline
eval(N n, D &reduce_data, F &&f)amrex::ReduceOps< Ps >inline
eval_mf(I, MF const &mf, IntVect const &nghost, int ncomp, D &reduce_data, F &&f)amrex::ReduceOps< Ps >inline
-value(D &reduce_data)amrex::ReduceOps< Ps >inline
+m_result_is_readyamrex::ReduceOps< Ps >private
+value(D &reduce_data)amrex::ReduceOps< Ps >inline
diff --git a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html
index fa30b2a7a0..f7c59a89db 100644
--- a/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html
+++ b/amrex/docs_html/doxygen/classamrex_1_1ReduceOps.html
@@ -97,6 +97,7 @@
Public Member Functions |
+Private Attributes |
List of all members
amrex::ReduceOps< Ps > Class Template Reference
@@ -128,6 +129,11 @@
template<typename D >
D::Type value (D &reduce_data)
+Private Attributes
+bool m_result_is_ready = false
Member Function Documentation
@@ -476,6 +482,31 @@
+Member Data Documentation
+◆ m_result_is_ready
+template<typename... Ps>
+bool amrex::ReduceOps< Ps >::m_result_is_ready = false
+private
The documentation for this class was generated from the following file:
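
For reference, here is a short usage sketch of the Reduce:: convenience wrappers listed in the source above (Sum, MinMax, AnyOf). It is illustrative only: report_stats, n, and v are hypothetical names, and v is assumed to point to n device-accessible values when GPU support is enabled.

    #include <AMReX.H>
    #include <AMReX_Print.H>
    #include <AMReX_Reduce.H>

    // Illustrative sketch (not library code).
    void report_stats (int n, amrex::Real const* v)
    {
        using namespace amrex;
        Real s   = Reduce::Sum(n, v);          // init_val defaults to 0
        auto mm  = Reduce::MinMax(n, v);       // std::pair<Real,Real>: {min, max}
        bool neg = Reduce::AnyOf(n, v,
            [=] AMREX_GPU_DEVICE (Real x) { return x < Real(0.); });
        Print() << "sum=" << s << " min=" << mm.first << " max=" << mm.second
                << " any negative=" << neg << '\n';
    }

Sum, Min, Max and MinMax build the same ReduceOps/ReduceData pair shown in the listing, so their defaults (init_val, numeric_limits bounds) match the declarations above; AnyOf instead uses a Gpu::DeviceScalar flag on the GPU path and std::any_of on the CPU path.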