From 09600886fbc120cd81d1dd289d9db8af56d0927e Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Mon, 19 Aug 2019 11:14:00 -0700 Subject: [PATCH 01/17] add extensive profiling to the CG iteration --- src/noise_routines.f90 | 90 +++++++++++++++++++++++++++++++---------- src/smadam_routines.f90 | 31 ++++++++++++-- 2 files changed, 96 insertions(+), 25 deletions(-) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 8e81426..3e4bacc 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -1137,7 +1137,7 @@ END SUBROUTINE construct_preconditioner !------------------------------------------------------------------------------ - SUBROUTINE preconditioning_band(z, r, nna) + SUBROUTINE preconditioning_band(z, r, nna, istep) !Apply the preconditioner ! z and r may be 3-dimensional arrays in the calling code but @@ -1147,14 +1147,19 @@ SUBROUTINE preconditioning_band(z, r, nna) real(dp), intent(in), target :: r(0:basis_order, noba_short, nodetectors) real(dp), intent(in) :: & nna(0:basis_order, 0:basis_order, noba_short, nodetectors) + integer, intent(in) :: istep integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, & - num_threads, m, nband, ipsd, isub + num_threads, m, nband, ipsd, isub, niter real(dp) :: t1, t2, tf real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) + ! DEBUG begin + real(dp) :: time_cholesky(100), time_cgprecond(100), time_cgprecond_filter(100) + integer :: ncall_cholesky(100), ncall_cgprecond(100), niter_cgprecond(100) + ! DEBUG end z = r @@ -1177,14 +1182,25 @@ SUBROUTINE preconditioning_band(z, r, nna) pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) end do + ! DEBUG begin + time_cholesky = 0 + time_cgprecond = 0 + time_cgprecond_filter = 0 + ncall_cholesky = 0 + ncall_cgprecond = 0 + niter_cgprecond = 0 + ! DEBUG end + !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, & !$OMP id, nof, nshort, detectors, nthreads, fprecond, nna, & - !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx) & + !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx, & + !$OMP time_cholesky, ncall_cholesky, time_cgprecond, & + !$OMP ncall_cgprecond, niter_cgprecond, time_cgprecond_filter) & !$OMP PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, & !$OMP xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, & - !$OMP t1, t2, tf) + !$OMP t1, t2, tf, niter) !t1 = get_time(55) !tf = 0 @@ -1211,16 +1227,30 @@ SUBROUTINE preconditioning_band(z, r, nna) ipsd = psd_index(idet, baselines_short_time(kstart+1)) if (allocated(bandprec(isub, ichunk, idet)%data)) then ! Use the precomputed Cholesky decomposition + t1 = MPI_Wtime() ! DEBUG nband = size(bandprec(isub, ichunk, idet)%data, 1) call apply_cholesky_decomposition( & noba, z(0, kstart+1, idet), r(0, kstart+1, idet), & nband, bandprec(isub, ichunk, idet)%data) + ! DEBUG begin + t2 = MPI_Wtime() + time_cholesky(id_thread) = time_cholesky(id_thread) + (t2 - t1) + ncall_cholesky(id_thread) = ncall_cholesky(id_thread) + 1 + ! DEBUG end else ! Use CG iteration to apply the preconditioner + t1 = MPI_Wtime() ! DEBUG call apply_CG_preconditioner( & noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), & z(0, kstart+1, idet), r(0, kstart+1, idet), & - invcov(1, ipsd), fx, xx, tf) + invcov(1, ipsd), fx, xx, tf, niter) + ! DEBUG begin + t2 = MPI_Wtime() + time_cgprecond(id_thread) = time_cgprecond(id_thread) + (t2 - t1) + time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf + niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter + ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1 + ! DEBUG end end if end if kstart = kstart + noba @@ -1237,6 +1267,19 @@ SUBROUTINE preconditioning_band(z, r, nna) end do end if + ! DEBUG begin + do id_thread = 0, nthreads - 1 + write(1000 + id, *) "step", istep, "thread", id_thread, & + "time_cholesky", time_cholesky(id_thread), & + "ncall_cholesky", ncall_cholesky(id_thread) + write(1000 + id, *) "step", istep, "thread", id_thread, & + "time_cgprecond", time_cgprecond(id_thread), & + "ncall_cgprecond", ncall_cgprecond(id_thread), & + "niter_cgprecond", niter_cgprecond(id_thread), & + "time_cgprecond_filter", time_cgprecond_filter(id_thread) + end do + ! DEBUG end + cputime_precond = cputime_precond + get_time(16) END SUBROUTINE preconditioning_band @@ -1289,7 +1332,7 @@ end subroutine get_cholesky_decomposition subroutine apply_CG_preconditioner( & - noba, nof, fcov, nna, x, b, invcov, fx, xx, tf) + noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, niter) ! ! Apply the preconditioner using CG iteration ! @@ -1298,9 +1341,10 @@ subroutine apply_CG_preconditioner( & real(dp), intent(in) :: nna(noba), invcov(noba) real(dp), intent(out) :: x(noba) real(dp), intent(in) :: b(noba) - real(dp), intent(inout) :: tf real(C_DOUBLE), intent(inout) :: xx(nof) complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1) + real(dp), intent(inout) :: tf + integer, intent(inout) :: niter real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:) real(dp) :: alpha, beta, rr, rr0, rr_old, rz, rz_old, Anorm @@ -1310,6 +1354,7 @@ subroutine apply_CG_preconditioner( & ! Polak-Ribiere formula to allow for a changing preconditioner real(dp), parameter :: cglimit = 1e-6 integer, parameter :: itermax = 100 + real(dp) :: tfilter allocate(resid(noba), prop(noba), Aprop(noba), zresid(noba), & precond(noba), stat=ierr) @@ -1328,8 +1373,9 @@ subroutine apply_CG_preconditioner( & prop = zresid iter = 1 + tfilter = 0 do - call apply_A(prop, Aprop, tf) + call apply_A(prop, Aprop) Anorm = dot_product(prop, Aprop) alpha = rz / Anorm x = x + alpha * prop @@ -1348,6 +1394,9 @@ subroutine apply_CG_preconditioner( & deallocate(resid, prop, Aprop, zresid, precond) + tf = tfilter + niter = iter + contains subroutine apply_precond(x, z) @@ -1360,34 +1409,33 @@ subroutine apply_precond(x, z) z = precond * x end subroutine apply_precond - subroutine apply_A(x, y, tf) + subroutine apply_A(x, y) ! ! Apply the square matrix `A` to `x` and place the result in `y` ! real(dp), intent(in) :: x(noba) real(dp), intent(out) :: y(noba) - real(dp), intent(inout) :: tf - call convolve(x, fcov, y, tf) - y = y + nna*x + call convolve(x, fcov, y) + y = y + nna * x end subroutine apply_A - subroutine convolve(x, fc, y, tf) + subroutine convolve(x, fc, y) ! ! Convolve `x` with filter `fc` and place result in `y`. ! real(dp), intent(in) :: x(noba) - complex(dp) :: fc(nof/2 + 1) + complex(dp) :: fc(nof / 2 + 1) real(dp), intent(out) :: y(noba) - real(dp), intent(inout) :: tf - real(dp) :: t1, t2 + real(dp) :: t xx(:noba) = x - sum(x)/noba - xx(noba+1:) = 0 - !t1 = get_time(56) + xx(noba + 1:) = 0 + t = MPI_Wtime() call dfft(fx, xx) - fx = fx*fc + tfilter = tfilter + (MPI_Wtime() - t) + fx = fx * fc + t = MPI_Wtime() call dfftinv(xx, fx) - !t2 = get_time(56) - !tf = tf + t2 - t1 + tfilter = tfilter + (MPI_Wtime() - t) y = xx(:noba) y = y - sum(y) / noba end subroutine convolve diff --git a/src/smadam_routines.f90 b/src/smadam_routines.f90 index 534ae62..9defdaa 100644 --- a/src/smadam_routines.f90 +++ b/src/smadam_routines.f90 @@ -701,6 +701,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) real(dp), allocatable, target :: ap_all_threads(:, :, :, :) real(dp), pointer :: ap_thread(:, :, :) real(dp), allocatable :: resid(:) + real(dp) :: t0, t1, t2, t3 if (info > 4) write(*,idf) ID,'Begin iteration...' @@ -730,7 +731,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) r = yba - call apply_preconditioner(z, r) + call apply_preconditioner(z, r, -1) p = z rz = sum(r * z, mask=rmask) @@ -786,6 +787,8 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) if (istep >= iter_max) exit istep = istep + 1 + t1 = MPI_Wtime() ! DEBUG + call reset_time(12) ! 1) evaluate A.p @@ -810,23 +813,33 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) end if end if + t2 = MPI_Wtime() ! DEBUG + write(1000 + id, *) "step", istep, "baseline_to_map", t2 - t1 ! DEBUG + ! Communicate maps between processes call wait_mpi cputime_cga_1 = cputime_cga_1 + get_time_and_reset(12) wamap = 0 + t1 = MPI_Wtime() ! DEBUG call collect_map(wamap, nosubpix_cross, .true.) ! locmap -> wamap + write(1000 + id, *) "step", istep, "collect_map", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_mpi_reduce = cputime_cga_mpi_reduce + get_time_and_reset(12) ! apply cca, rejects masked pixels + t1 = MPI_Wtime() ! DEBUG call ccmultiply(cca, wamap, nopix_cross) + write(1000 + id, *) "step", istep, "ccmultiply", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_cc = cputime_cga_cc + get_time_and_reset(12) + t1 = MPI_Wtime() ! DEBUG call scatter_map(wamap, nosubpix_cross) ! wamap -> locmap + write(1000 + id, *) "step", istep, "scatter_map", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_mpi_scatter = cputime_cga_mpi_scatter & + get_time_and_reset(12) + t1 = MPI_Wtime() ! DEBUG if (kfilter) then call cinvmul(ap, p, nna) else @@ -843,11 +856,13 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) end do end if end if + write(1000 + id, *) "step", istep, "cinvmul", MPI_Wtime() - t1 ! DEBUG ! From map to baseline call reset_time(12) + t1 = MPI_Wtime() ! DEBUG ap_all_threads = 0 if (basis_order == 0) then @@ -868,10 +883,12 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) ap = ap + sum(ap_all_threads, dim=4) + write(1000 + id, *) "step", istep, "map_to_baseline", MPI_Wtime() - t1 ! DEBUG cputime_cga_2 = cputime_cga_2 + get_time(12) ! 2) Evaluate p^T.A.p + t1 = MPI_Wtime() ! DEBUG pap = sum(p*ap, mask=rmask) call sum_mpi(pap) @@ -886,12 +903,16 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) aa(:, 1:noba_short, 1:nodetectors) + alpha*p r = r - alpha*ap + write(1000 + id, *) "step", istep, "update_guess", MPI_Wtime() - t1 ! DEBUG ! 5) Precondition - call apply_preconditioner(z, r) + t1 = MPI_Wtime() ! DEBUG + call apply_preconditioner(z, r, istep) + write(1000 + id, *) "step", istep, "precondition", MPI_Wtime() - t1 ! DEBUG ! 6) Check for convergence + t1 = MPI_Wtime() ! DEBUG rzo = rz rz = sum(r * z, mask=rmask) rz2 = sum(ro * z, mask=rmask) @@ -905,6 +926,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) ! This is the Polak-Ribiere formula that ! allows for updates to the preconditioner beta = (rz - rz2) / rzo + write(1000 + id, *) "step", istep, "check_convergence", MPI_Wtime() - t1 ! DEBUG if (ID==0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') & istep, rr/rrinit, rz2/rz, alpha, beta, get_time_and_reset(99) @@ -936,12 +958,13 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) contains - subroutine apply_preconditioner(z, r) + subroutine apply_preconditioner(z, r, istep) real(dp), intent(in) :: r(0:basis_order, noba_short, nodetectors) real(dp), intent(out) :: z(0:basis_order, noba_short, nodetectors) + integer, intent(in) :: istep integer :: i, idet if (basis_order == 0) then - call preconditioning_band(z, r, nna) + call preconditioning_band(z, r, nna, istep) else do idet = 1, nodetectors do i = 1, noba_short From 8f290c238cc02b90771848c3368510b1b100dbe1 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Mon, 19 Aug 2019 12:39:18 -0700 Subject: [PATCH 02/17] Fix index error --- src/noise_routines.f90 | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 3e4bacc..7d3eae4 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -1157,8 +1157,8 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) ! DEBUG begin - real(dp) :: time_cholesky(100), time_cgprecond(100), time_cgprecond_filter(100) - integer :: ncall_cholesky(100), ncall_cgprecond(100), niter_cgprecond(100) + real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), time_cgprecond_filter(:) + integer, allocatable :: ncall_cholesky(:), ncall_cgprecond(:), niter_cgprecond(:) ! DEBUG end z = r @@ -1169,6 +1169,18 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) call reset_time(16) + ! DEBUG begin + allocate(time_cholesky(0:nthreads-1), time_cgprecond(0:nthreads-1), & + time_cgprecond_filter(0:nthreads-1), ncall_cholesky(0:nthreads-1), & + ncall_cgprecond(0:nthreads-1), niter_cgprecond(0:nthreads-1)) + time_cholesky = 0 + time_cgprecond = 0 + time_cgprecond_filter = 0 + ncall_cholesky = 0 + ncall_cgprecond = 0 + niter_cgprecond = 0 + ! DEBUG end + if (use_diagonal) then z(0, :, :) = r(0, :, :) * prec_diag else if (use_fprecond) then @@ -1182,15 +1194,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) end do - ! DEBUG begin - time_cholesky = 0 - time_cgprecond = 0 - time_cgprecond_filter = 0 - ncall_cholesky = 0 - ncall_cgprecond = 0 - niter_cgprecond = 0 - ! DEBUG end - !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, & @@ -1278,6 +1281,8 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) "niter_cgprecond", niter_cgprecond(id_thread), & "time_cgprecond_filter", time_cgprecond_filter(id_thread) end do + deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, & + ncall_cholesky, ncall_cgprecond, niter_cgprecond) ! DEBUG end cputime_precond = cputime_precond + get_time(16) From f992fba18eb5bd780f3e9c9d1feb1e25bdae5973 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Mon, 19 Aug 2019 13:34:46 -0700 Subject: [PATCH 03/17] clean output and add barriers --- src/noise_routines.f90 | 32 ++++++++++++++++++-------------- src/smadam_routines.f90 | 39 +++++++++++++++++++++------------------ 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 7d3eae4..08fd834 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -1151,7 +1151,7 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, & num_threads, m, nband, ipsd, isub, niter - real(dp) :: t1, t2, tf + real(dp) :: t1, t2, tf, t_fftw_memory real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() @@ -1189,10 +1189,12 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) else ! Allocate FFTW work space outside the threaded region to avoid ! a GCC compiler segfault when freeing the workspace + t1 = MPI_Wtime() ! DEBUG do id_thread = 0, nthreads - 1 pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) end do + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_alloc", MPI_Wtime() - t1 ! DEBUG !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & @@ -1236,8 +1238,7 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) noba, z(0, kstart+1, idet), r(0, kstart+1, idet), & nband, bandprec(isub, ichunk, idet)%data) ! DEBUG begin - t2 = MPI_Wtime() - time_cholesky(id_thread) = time_cholesky(id_thread) + (t2 - t1) + time_cholesky(id_thread) = time_cholesky(id_thread) + (MPI_Wtime() - t1) ncall_cholesky(id_thread) = ncall_cholesky(id_thread) + 1 ! DEBUG end else @@ -1248,8 +1249,7 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) z(0, kstart+1, idet), r(0, kstart+1, idet), & invcov(1, ipsd), fx, xx, tf, niter) ! DEBUG begin - t2 = MPI_Wtime() - time_cgprecond(id_thread) = time_cgprecond(id_thread) + (t2 - t1) + time_cgprecond(id_thread) = time_cgprecond(id_thread) + (MPI_Wtime() - t1) time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1 @@ -1264,22 +1264,26 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) !$OMP END PARALLEL + t1 = MPI_Wtime() ! DEBUG do id_thread = 0, nthreads - 1 call fftw_free(pxx(id_thread)) call fftw_free(pfx(id_thread)) end do + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_free", MPI_Wtime() - t1 ! DEBUG end if ! DEBUG begin do id_thread = 0, nthreads - 1 - write(1000 + id, *) "step", istep, "thread", id_thread, & - "time_cholesky", time_cholesky(id_thread), & - "ncall_cholesky", ncall_cholesky(id_thread) - write(1000 + id, *) "step", istep, "thread", id_thread, & - "time_cgprecond", time_cgprecond(id_thread), & - "ncall_cgprecond", ncall_cgprecond(id_thread), & - "niter_cgprecond", niter_cgprecond(id_thread), & - "time_cgprecond_filter", time_cgprecond_filter(id_thread) + write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i8)') & + "step", istep, " thread", id_thread, & + " time_cholesky", time_cholesky(id_thread), & + " ncall_cholesky", ncall_cholesky(id_thread) + write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2)') & + "step", istep, " thread", id_thread, & + " time_cgprecond", time_cgprecond(id_thread), & + " ncall_cgprecond", ncall_cgprecond(id_thread), & + " niter_cgprecond", niter_cgprecond(id_thread), & + " time_cgprecond_filter", time_cgprecond_filter(id_thread) end do deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, & ncall_cholesky, ncall_cgprecond, niter_cgprecond) @@ -1432,7 +1436,7 @@ subroutine convolve(x, fc, y) complex(dp) :: fc(nof / 2 + 1) real(dp), intent(out) :: y(noba) real(dp) :: t - xx(:noba) = x - sum(x)/noba + xx(:noba) = x - sum(x) / noba xx(noba + 1:) = 0 t = MPI_Wtime() call dfft(fx, xx) diff --git a/src/smadam_routines.f90 b/src/smadam_routines.f90 index 9defdaa..63a360b 100644 --- a/src/smadam_routines.f90 +++ b/src/smadam_routines.f90 @@ -814,7 +814,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) end if t2 = MPI_Wtime() ! DEBUG - write(1000 + id, *) "step", istep, "baseline_to_map", t2 - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "baseline_to_map", t2 - t1 ! DEBUG ! Communicate maps between processes call wait_mpi @@ -823,18 +823,18 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) wamap = 0 t1 = MPI_Wtime() ! DEBUG call collect_map(wamap, nosubpix_cross, .true.) ! locmap -> wamap - write(1000 + id, *) "step", istep, "collect_map", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "collect_map", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_mpi_reduce = cputime_cga_mpi_reduce + get_time_and_reset(12) ! apply cca, rejects masked pixels t1 = MPI_Wtime() ! DEBUG call ccmultiply(cca, wamap, nopix_cross) - write(1000 + id, *) "step", istep, "ccmultiply", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "ccmultiply", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_cc = cputime_cga_cc + get_time_and_reset(12) t1 = MPI_Wtime() ! DEBUG call scatter_map(wamap, nosubpix_cross) ! wamap -> locmap - write(1000 + id, *) "step", istep, "scatter_map", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "scatter_map", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_mpi_scatter = cputime_cga_mpi_scatter & + get_time_and_reset(12) @@ -856,7 +856,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) end do end if end if - write(1000 + id, *) "step", istep, "cinvmul", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "cinvmul", MPI_Wtime() - t1 ! DEBUG ! From map to baseline @@ -883,35 +883,38 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) ap = ap + sum(ap_all_threads, dim=4) - write(1000 + id, *) "step", istep, "map_to_baseline", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "map_to_baseline", MPI_Wtime() - t1 ! DEBUG cputime_cga_2 = cputime_cga_2 + get_time(12) ! 2) Evaluate p^T.A.p + call wait_mpi ! DEBUB t1 = MPI_Wtime() ! DEBUG - pap = sum(p*ap, mask=rmask) + pap = sum(p * ap, mask=rmask) call sum_mpi(pap) ! 3) alpha = r.z / (p^T.A.p) - alpha = rz/pap + alpha = rz / pap ro = r ! Keep a copy for Polak-Ribiere beta ! 4) update `aa` and `r` aa(:, 1:noba_short, 1:nodetectors) = & - aa(:, 1:noba_short, 1:nodetectors) + alpha*p - r = r - alpha*ap + aa(:, 1:noba_short, 1:nodetectors) + alpha * p + r = r - alpha * ap - write(1000 + id, *) "step", istep, "update_guess", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "update_guess", MPI_Wtime() - t1 ! DEBUG ! 5) Precondition + call wait_mpi ! DEBUB t1 = MPI_Wtime() ! DEBUG call apply_preconditioner(z, r, istep) - write(1000 + id, *) "step", istep, "precondition", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "precondition", MPI_Wtime() - t1 ! DEBUG ! 6) Check for convergence + call wait_mpi ! DEBUB t1 = MPI_Wtime() ! DEBUG rzo = rz rz = sum(r * z, mask=rmask) @@ -922,22 +925,22 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) call sum_mpi(rr) ! This is the Fletcher-Reeves formula that ! assumes stationary preconditioning - !beta = rz / rzo + ! beta = rz / rzo ! This is the Polak-Ribiere formula that ! allows for updates to the preconditioner beta = (rz - rz2) / rzo - write(1000 + id, *) "step", istep, "check_convergence", MPI_Wtime() - t1 ! DEBUG + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "check_convergence", MPI_Wtime() - t1 ! DEBUG if (ID==0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') & - istep, rr/rrinit, rz2/rz, alpha, beta, get_time_and_reset(99) + istep, rr / rrinit, rz2 / rz, alpha, beta, get_time_and_reset(99) - if (rr/rrinit > 1e3) call abort_mpi('CG is diverging') - if (rr/rrinit < cglimit .and. istep > iter_min) exit + if (rr / rrinit > 1e3) call abort_mpi('CG is diverging') + if (rr / rrinit < cglimit .and. istep > iter_min) exit if (rz == 0) exit ! 7) Update search direction, `p` - p = z + beta*p + p = z + beta * p end do From 8c71e1a7d5b685f82405cda28f4fa34c72e7dc7c Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Mon, 19 Aug 2019 14:04:28 -0700 Subject: [PATCH 04/17] time convolution --- src/noise_routines.f90 | 45 +++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 08fd834..53fdc44 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -1151,13 +1151,14 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, & num_threads, m, nband, ipsd, isub, niter - real(dp) :: t1, t2, tf, t_fftw_memory + real(dp) :: t1, t2, tf, t_fftw_memory, tc real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) ! DEBUG begin - real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), time_cgprecond_filter(:) + real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), & + time_cgprecond_filter(:), time_cgprecond_convolve(:) integer, allocatable :: ncall_cholesky(:), ncall_cgprecond(:), niter_cgprecond(:) ! DEBUG end @@ -1171,11 +1172,13 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) ! DEBUG begin allocate(time_cholesky(0:nthreads-1), time_cgprecond(0:nthreads-1), & - time_cgprecond_filter(0:nthreads-1), ncall_cholesky(0:nthreads-1), & + time_cgprecond_filter(0:nthreads-1), time_cgprecond_convolve(0:nthreads-1), & + ncall_cholesky(0:nthreads-1), & ncall_cgprecond(0:nthreads-1), niter_cgprecond(0:nthreads-1)) time_cholesky = 0 time_cgprecond = 0 time_cgprecond_filter = 0 + time_cgprecond_convolve = 0 ncall_cholesky = 0 ncall_cgprecond = 0 niter_cgprecond = 0 @@ -1202,10 +1205,10 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) !$OMP id, nof, nshort, detectors, nthreads, fprecond, nna, & !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx, & !$OMP time_cholesky, ncall_cholesky, time_cgprecond, & - !$OMP ncall_cgprecond, niter_cgprecond, time_cgprecond_filter) & + !$OMP ncall_cgprecond, niter_cgprecond, time_cgprecond_filter, time_cgprecond_convolve) & !$OMP PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, & !$OMP xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, & - !$OMP t1, t2, tf, niter) + !$OMP t1, t2, tf, tc, niter) !t1 = get_time(55) !tf = 0 @@ -1247,10 +1250,11 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) call apply_CG_preconditioner( & noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), & z(0, kstart+1, idet), r(0, kstart+1, idet), & - invcov(1, ipsd), fx, xx, tf, niter) + invcov(1, ipsd), fx, xx, tf, tc, niter) ! DEBUG begin time_cgprecond(id_thread) = time_cgprecond(id_thread) + (MPI_Wtime() - t1) time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf + time_cgprecond_convolve(id_thread) = time_cgprecond_convolve(id_thread) + tc niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1 ! DEBUG end @@ -1278,12 +1282,13 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) "step", istep, " thread", id_thread, & " time_cholesky", time_cholesky(id_thread), & " ncall_cholesky", ncall_cholesky(id_thread) - write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2)') & + write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2,a,x,f8.2)') & "step", istep, " thread", id_thread, & " time_cgprecond", time_cgprecond(id_thread), & " ncall_cgprecond", ncall_cgprecond(id_thread), & " niter_cgprecond", niter_cgprecond(id_thread), & - " time_cgprecond_filter", time_cgprecond_filter(id_thread) + " time_cgprecond_filter", time_cgprecond_filter(id_thread), & + " time_cgprecond_convolve", time_cgprecond_convolve(id_thread) end do deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, & ncall_cholesky, ncall_cgprecond, niter_cgprecond) @@ -1341,7 +1346,7 @@ end subroutine get_cholesky_decomposition subroutine apply_CG_preconditioner( & - noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, niter) + noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, tc, niter) ! ! Apply the preconditioner using CG iteration ! @@ -1352,7 +1357,7 @@ subroutine apply_CG_preconditioner( & real(dp), intent(in) :: b(noba) real(C_DOUBLE), intent(inout) :: xx(nof) complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1) - real(dp), intent(inout) :: tf + real(dp), intent(inout) :: tf, tc integer, intent(inout) :: niter real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:) @@ -1363,7 +1368,7 @@ subroutine apply_CG_preconditioner( & ! Polak-Ribiere formula to allow for a changing preconditioner real(dp), parameter :: cglimit = 1e-6 integer, parameter :: itermax = 100 - real(dp) :: tfilter + real(dp) :: tfilter, tconv ! DEBUG allocate(resid(noba), prop(noba), Aprop(noba), zresid(noba), & precond(noba), stat=ierr) @@ -1382,7 +1387,10 @@ subroutine apply_CG_preconditioner( & prop = zresid iter = 1 + ! DEBUG begin tfilter = 0 + tconv = 0 + ! DEBUG end do call apply_A(prop, Aprop) Anorm = dot_product(prop, Aprop) @@ -1403,8 +1411,11 @@ subroutine apply_CG_preconditioner( & deallocate(resid, prop, Aprop, zresid, precond) + ! DEBUG begin tf = tfilter + tc = tconv niter = iter + ! DEBUG end contains @@ -1435,18 +1446,20 @@ subroutine convolve(x, fc, y) real(dp), intent(in) :: x(noba) complex(dp) :: fc(nof / 2 + 1) real(dp), intent(out) :: y(noba) - real(dp) :: t + real(dp) :: t1, t2 + t1 = MPI_Wtime() ! DEBUG xx(:noba) = x - sum(x) / noba xx(noba + 1:) = 0 - t = MPI_Wtime() + t2 = MPI_Wtime() ! DEBUG call dfft(fx, xx) - tfilter = tfilter + (MPI_Wtime() - t) + tfilter = tfilter + (MPI_Wtime() - t2) ! DEBUG fx = fx * fc - t = MPI_Wtime() + t2 = MPI_Wtime() ! DEBUG call dfftinv(xx, fx) - tfilter = tfilter + (MPI_Wtime() - t) + tfilter = tfilter + (MPI_Wtime() - t2) ! DEBUG y = xx(:noba) y = y - sum(y) / noba + tconv = tconv + (MPI_Wtime() - t1) ! DEBUG end subroutine convolve end subroutine apply_CG_preconditioner From 920676d54c4d3351c2281fe600bef8c32dfa4e21 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Mon, 19 Aug 2019 15:09:20 -0700 Subject: [PATCH 05/17] allocate thread work space only once --- src/noise_routines.f90 | 73 +++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 53fdc44..18d1196 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -41,6 +41,9 @@ MODULE noise_routines type(bandprec_pointer), allocatable :: bandprec(:, :, :) real(dp), allocatable, target :: prec_diag(:, :) + ! Pointers for FFT workspace + type(C_PTR), allocatable :: pxx(:), pfx(:) + ! PSD information integer :: npsdtot @@ -235,6 +238,7 @@ end function psd_index_det SUBROUTINE init_filter integer :: nof_min, ierr + real(dp) :: t1 if (.not. kfilter) return @@ -247,17 +251,30 @@ SUBROUTINE init_filter nof = 1 do if (nof >= nof_min) exit - nof = 2*nof + nof = 2 * nof end do if (ID == 0) write(*,*) 'FFT length =', nof !allocate(fx(nof/2+1), xx(nof), stat=ierr) ! Work space !if (ierr /= 0) call abort_mpi('No room for fx and xx') - memory_filter = ((nof/2+1)*16. + nof*8) * nthreads + memory_filter = ((nof / 2 + 1) * 16. + nof * 8) * nthreads call init_fourier(nof) + ! Permanently allocate space for FFT for each OpenMP thread + + t1 = MPI_Wtime() ! DEBUG + allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr) + if (ierr /= 0) stop 'Failed to allocate pxx and pfx' + pxx = C_NULL_PTR + pfx = C_NULL_PTR + do id_thread = 0, nthreads - 1 + pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) + pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) + end do + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", -1, "fftw_alloc", MPI_Wtime() - t1 ! DEBUG + END SUBROUTINE init_filter @@ -295,6 +312,8 @@ end subroutine free_bandprec SUBROUTINE close_filter + real(dp) :: t1 + if (.not. kfilter) return if (allocated(fcov)) deallocate(fcov) @@ -311,6 +330,14 @@ SUBROUTINE close_filter call close_fourier + t1 = MPI_Wtime() ! DEBUG + do id_thread = 0, nthreads - 1 + call fftw_free(pxx(id_thread)) + call fftw_free(pfx(id_thread)) + end do + deallocate(pxx, pfx) + write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", 999, "fftw_free", MPI_Wtime() - t1 ! DEBUG + nof = -1 END SUBROUTINE close_filter @@ -833,19 +860,11 @@ SUBROUTINE convolve_pp(y, x, fc, nna) real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() - type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) call reset_time(14) y = x - ! Allocate FFTW work space outside the threaded region to avoid - ! a GCC compiler segfault when freeing the workspace - do id_thread = 0, nthreads - 1 - pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) - pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) - end do - !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(nof, nodetectors, ninterval, noba_short_pp, & @@ -870,11 +889,6 @@ SUBROUTINE convolve_pp(y, x, fc, nna) !$OMP END PARALLEL - do id_thread = 0, nthreads - 1 - call fftw_free(pxx(id_thread)) - call fftw_free(pfx(id_thread)) - end do - cputime_filter = cputime_filter + get_time(14) END SUBROUTINE convolve_pp @@ -897,7 +911,6 @@ SUBROUTINE construct_preconditioner(nna) real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() - type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) if (precond_width_max < 1) return @@ -969,13 +982,6 @@ SUBROUTINE construct_preconditioner(nna) call allocate_bandprec - ! Allocate FFTW work space outside the threaded region to avoid - ! a GCC compiler segfault when freeing the workspace - do id_thread = 0, nthreads - 1 - pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) - pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) - end do - !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(nof, npsdtot, fcov, invcov, pxx, pfx) & @@ -996,11 +1002,6 @@ SUBROUTINE construct_preconditioner(nna) !$OMP END PARALLEL - do id_thread = 0, nthreads - 1 - call fftw_free(pxx(id_thread)) - call fftw_free(pfx(id_thread)) - end do - if (.not. use_cgprecond) then nempty = 0 nfail = 0 @@ -1155,7 +1156,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() - type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) ! DEBUG begin real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), & time_cgprecond_filter(:), time_cgprecond_convolve(:) @@ -1190,15 +1190,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) ! Apply the filter-based preconditioner call convolve_pp(z, r, fprecond, nna) else - ! Allocate FFTW work space outside the threaded region to avoid - ! a GCC compiler segfault when freeing the workspace - t1 = MPI_Wtime() ! DEBUG - do id_thread = 0, nthreads - 1 - pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) - pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) - end do - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_alloc", MPI_Wtime() - t1 ! DEBUG - !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, & @@ -1268,12 +1259,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) !$OMP END PARALLEL - t1 = MPI_Wtime() ! DEBUG - do id_thread = 0, nthreads - 1 - call fftw_free(pxx(id_thread)) - call fftw_free(pfx(id_thread)) - end do - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_free", MPI_Wtime() - t1 ! DEBUG end if ! DEBUG begin From a0ba8534712488b2fd3d74e0338e5fb933154583 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Mon, 19 Aug 2019 16:13:46 -0700 Subject: [PATCH 06/17] Remove debug statements --- src/noise_routines.f90 | 93 ++++++----------------------------------- src/smadam_routines.f90 | 23 +--------- 2 files changed, 13 insertions(+), 103 deletions(-) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 18d1196..270a2e1 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -264,7 +264,6 @@ SUBROUTINE init_filter ! Permanently allocate space for FFT for each OpenMP thread - t1 = MPI_Wtime() ! DEBUG allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr) if (ierr /= 0) stop 'Failed to allocate pxx and pfx' pxx = C_NULL_PTR @@ -273,7 +272,6 @@ SUBROUTINE init_filter pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) end do - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", -1, "fftw_alloc", MPI_Wtime() - t1 ! DEBUG END SUBROUTINE init_filter @@ -330,13 +328,11 @@ SUBROUTINE close_filter call close_fourier - t1 = MPI_Wtime() ! DEBUG do id_thread = 0, nthreads - 1 call fftw_free(pxx(id_thread)) call fftw_free(pfx(id_thread)) end do deallocate(pxx, pfx) - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", 999, "fftw_free", MPI_Wtime() - t1 ! DEBUG nof = -1 @@ -1151,16 +1147,11 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) integer, intent(in) :: istep integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, & - num_threads, m, nband, ipsd, isub, niter - real(dp) :: t1, t2, tf, t_fftw_memory, tc + num_threads, m, nband, ipsd, isub + real(dp) :: t1, t2, tf real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() - ! DEBUG begin - real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), & - time_cgprecond_filter(:), time_cgprecond_convolve(:) - integer, allocatable :: ncall_cholesky(:), ncall_cgprecond(:), niter_cgprecond(:) - ! DEBUG end z = r @@ -1170,20 +1161,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) call reset_time(16) - ! DEBUG begin - allocate(time_cholesky(0:nthreads-1), time_cgprecond(0:nthreads-1), & - time_cgprecond_filter(0:nthreads-1), time_cgprecond_convolve(0:nthreads-1), & - ncall_cholesky(0:nthreads-1), & - ncall_cgprecond(0:nthreads-1), niter_cgprecond(0:nthreads-1)) - time_cholesky = 0 - time_cgprecond = 0 - time_cgprecond_filter = 0 - time_cgprecond_convolve = 0 - ncall_cholesky = 0 - ncall_cgprecond = 0 - niter_cgprecond = 0 - ! DEBUG end - if (use_diagonal) then z(0, :, :) = r(0, :, :) * prec_diag else if (use_fprecond) then @@ -1194,12 +1171,10 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) !$OMP DEFAULT(NONE) & !$OMP SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, & !$OMP id, nof, nshort, detectors, nthreads, fprecond, nna, & - !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx, & - !$OMP time_cholesky, ncall_cholesky, time_cgprecond, & - !$OMP ncall_cgprecond, niter_cgprecond, time_cgprecond_filter, time_cgprecond_convolve) & + !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx) & !$OMP PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, & !$OMP xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, & - !$OMP t1, t2, tf, tc, niter) + !$OMP t1, t2, tf) !t1 = get_time(55) !tf = 0 @@ -1226,29 +1201,16 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) ipsd = psd_index(idet, baselines_short_time(kstart+1)) if (allocated(bandprec(isub, ichunk, idet)%data)) then ! Use the precomputed Cholesky decomposition - t1 = MPI_Wtime() ! DEBUG nband = size(bandprec(isub, ichunk, idet)%data, 1) call apply_cholesky_decomposition( & noba, z(0, kstart+1, idet), r(0, kstart+1, idet), & nband, bandprec(isub, ichunk, idet)%data) - ! DEBUG begin - time_cholesky(id_thread) = time_cholesky(id_thread) + (MPI_Wtime() - t1) - ncall_cholesky(id_thread) = ncall_cholesky(id_thread) + 1 - ! DEBUG end else ! Use CG iteration to apply the preconditioner - t1 = MPI_Wtime() ! DEBUG call apply_CG_preconditioner( & noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), & z(0, kstart+1, idet), r(0, kstart+1, idet), & - invcov(1, ipsd), fx, xx, tf, tc, niter) - ! DEBUG begin - time_cgprecond(id_thread) = time_cgprecond(id_thread) + (MPI_Wtime() - t1) - time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf - time_cgprecond_convolve(id_thread) = time_cgprecond_convolve(id_thread) + tc - niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter - ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1 - ! DEBUG end + invcov(1, ipsd), fx, xx, tf) end if end if kstart = kstart + noba @@ -1261,24 +1223,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) end if - ! DEBUG begin - do id_thread = 0, nthreads - 1 - write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i8)') & - "step", istep, " thread", id_thread, & - " time_cholesky", time_cholesky(id_thread), & - " ncall_cholesky", ncall_cholesky(id_thread) - write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2,a,x,f8.2)') & - "step", istep, " thread", id_thread, & - " time_cgprecond", time_cgprecond(id_thread), & - " ncall_cgprecond", ncall_cgprecond(id_thread), & - " niter_cgprecond", niter_cgprecond(id_thread), & - " time_cgprecond_filter", time_cgprecond_filter(id_thread), & - " time_cgprecond_convolve", time_cgprecond_convolve(id_thread) - end do - deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, & - ncall_cholesky, ncall_cgprecond, niter_cgprecond) - ! DEBUG end - cputime_precond = cputime_precond + get_time(16) END SUBROUTINE preconditioning_band @@ -1331,7 +1275,7 @@ end subroutine get_cholesky_decomposition subroutine apply_CG_preconditioner( & - noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, tc, niter) + noba, nof, fcov, nna, x, b, invcov, fx, xx, tfilter) ! ! Apply the preconditioner using CG iteration ! @@ -1342,8 +1286,7 @@ subroutine apply_CG_preconditioner( & real(dp), intent(in) :: b(noba) real(C_DOUBLE), intent(inout) :: xx(nof) complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1) - real(dp), intent(inout) :: tf, tc - integer, intent(inout) :: niter + real(dp), intent(out) :: tfilter real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:) real(dp) :: alpha, beta, rr, rr0, rr_old, rz, rz_old, Anorm @@ -1353,7 +1296,6 @@ subroutine apply_CG_preconditioner( & ! Polak-Ribiere formula to allow for a changing preconditioner real(dp), parameter :: cglimit = 1e-6 integer, parameter :: itermax = 100 - real(dp) :: tfilter, tconv ! DEBUG allocate(resid(noba), prop(noba), Aprop(noba), zresid(noba), & precond(noba), stat=ierr) @@ -1372,10 +1314,7 @@ subroutine apply_CG_preconditioner( & prop = zresid iter = 1 - ! DEBUG begin tfilter = 0 - tconv = 0 - ! DEBUG end do call apply_A(prop, Aprop) Anorm = dot_product(prop, Aprop) @@ -1396,12 +1335,6 @@ subroutine apply_CG_preconditioner( & deallocate(resid, prop, Aprop, zresid, precond) - ! DEBUG begin - tf = tfilter - tc = tconv - niter = iter - ! DEBUG end - contains subroutine apply_precond(x, z) @@ -1431,20 +1364,18 @@ subroutine convolve(x, fc, y) real(dp), intent(in) :: x(noba) complex(dp) :: fc(nof / 2 + 1) real(dp), intent(out) :: y(noba) - real(dp) :: t1, t2 - t1 = MPI_Wtime() ! DEBUG + !real(dp) :: t xx(:noba) = x - sum(x) / noba xx(noba + 1:) = 0 - t2 = MPI_Wtime() ! DEBUG + !t = MPI_Wtime() call dfft(fx, xx) - tfilter = tfilter + (MPI_Wtime() - t2) ! DEBUG + !tfilter = tfilter + (MPI_Wtime() - t) fx = fx * fc - t2 = MPI_Wtime() ! DEBUG + !t = MPI_Wtime() call dfftinv(xx, fx) - tfilter = tfilter + (MPI_Wtime() - t2) ! DEBUG + !tfilter = tfilter + (MPI_Wtime() - t) y = xx(:noba) y = y - sum(y) / noba - tconv = tconv + (MPI_Wtime() - t1) ! DEBUG end subroutine convolve end subroutine apply_CG_preconditioner diff --git a/src/smadam_routines.f90 b/src/smadam_routines.f90 index 63a360b..37686a3 100644 --- a/src/smadam_routines.f90 +++ b/src/smadam_routines.f90 @@ -787,8 +787,6 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) if (istep >= iter_max) exit istep = istep + 1 - t1 = MPI_Wtime() ! DEBUG - call reset_time(12) ! 1) evaluate A.p @@ -813,33 +811,23 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) end if end if - t2 = MPI_Wtime() ! DEBUG - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "baseline_to_map", t2 - t1 ! DEBUG - ! Communicate maps between processes call wait_mpi cputime_cga_1 = cputime_cga_1 + get_time_and_reset(12) wamap = 0 - t1 = MPI_Wtime() ! DEBUG call collect_map(wamap, nosubpix_cross, .true.) ! locmap -> wamap - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "collect_map", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_mpi_reduce = cputime_cga_mpi_reduce + get_time_and_reset(12) ! apply cca, rejects masked pixels - t1 = MPI_Wtime() ! DEBUG call ccmultiply(cca, wamap, nopix_cross) - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "ccmultiply", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_cc = cputime_cga_cc + get_time_and_reset(12) - t1 = MPI_Wtime() ! DEBUG call scatter_map(wamap, nosubpix_cross) ! wamap -> locmap - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "scatter_map", MPI_Wtime() - t1 ! DEBUG call wait_mpi cputime_cga_mpi_scatter = cputime_cga_mpi_scatter & + get_time_and_reset(12) - t1 = MPI_Wtime() ! DEBUG if (kfilter) then call cinvmul(ap, p, nna) else @@ -856,13 +844,11 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) end do end if end if - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "cinvmul", MPI_Wtime() - t1 ! DEBUG ! From map to baseline call reset_time(12) - t1 = MPI_Wtime() ! DEBUG ap_all_threads = 0 if (basis_order == 0) then @@ -883,13 +869,11 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) ap = ap + sum(ap_all_threads, dim=4) - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "map_to_baseline", MPI_Wtime() - t1 ! DEBUG cputime_cga_2 = cputime_cga_2 + get_time(12) ! 2) Evaluate p^T.A.p call wait_mpi ! DEBUB - t1 = MPI_Wtime() ! DEBUG pap = sum(p * ap, mask=rmask) call sum_mpi(pap) @@ -904,18 +888,14 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) aa(:, 1:noba_short, 1:nodetectors) + alpha * p r = r - alpha * ap - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "update_guess", MPI_Wtime() - t1 ! DEBUG ! 5) Precondition call wait_mpi ! DEBUB - t1 = MPI_Wtime() ! DEBUG call apply_preconditioner(z, r, istep) - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "precondition", MPI_Wtime() - t1 ! DEBUG ! 6) Check for convergence call wait_mpi ! DEBUB - t1 = MPI_Wtime() ! DEBUG rzo = rz rz = sum(r * z, mask=rmask) rz2 = sum(ro * z, mask=rmask) @@ -929,9 +909,8 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) ! This is the Polak-Ribiere formula that ! allows for updates to the preconditioner beta = (rz - rz2) / rzo - write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "check_convergence", MPI_Wtime() - t1 ! DEBUG - if (ID==0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') & + if (ID == 0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') & istep, rr / rrinit, rz2 / rz, alpha, beta, get_time_and_reset(99) if (rr / rrinit > 1e3) call abort_mpi('CG is diverging') From e14dbf5f2c194728dfd62adf92f8069bba1d3456 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Wed, 21 Aug 2019 10:21:50 -0700 Subject: [PATCH 07/17] initialize filter in a separate call --- src/noise_routines.f90 | 3 +-- src/smadam.F90 | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 270a2e1..c2e9dfe 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -241,6 +241,7 @@ SUBROUTINE init_filter real(dp) :: t1 if (.not. kfilter) return + if (nof >= 0) return ! Filter already initialized nofilter = 2 * noba_short_pp_max @@ -358,8 +359,6 @@ SUBROUTINE build_filter() if (info == 3 .and. ID == 0) write(*,*) 'Building filter...' if (info > 4) write(*,*) 'Building filter...' - if (nof < 0) call init_filter - npsdtot = 0 do idet = 1, nodetectors npsdtot = npsdtot + detectors(idet)%npsd diff --git a/src/smadam.F90 b/src/smadam.F90 index 2a04be1..7c49527 100644 --- a/src/smadam.F90 +++ b/src/smadam.F90 @@ -254,6 +254,10 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & cputime_init = cputime_init + get_time(1) + call tic + call init_filter + if (id == 0) call toc('init_filter') + call tic call build_filter if (id == 0) call toc('build_filter') From bca5532b981acd77c8be4faabbd9af0f082ed752 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Wed, 21 Aug 2019 12:29:51 -0700 Subject: [PATCH 08/17] use unaligned FFT in Madam --- src/commonparam.f90 | 1 + src/fourier_fftw_2003.f90 | 7 +- src/noise_routines.f90 | 194 ++++++++++++++++++++++---------------- 3 files changed, 120 insertions(+), 82 deletions(-) diff --git a/src/commonparam.f90 b/src/commonparam.f90 index ba4f403..991e1f4 100644 --- a/src/commonparam.f90 +++ b/src/commonparam.f90 @@ -130,6 +130,7 @@ MODULE commonparam real(dp) :: dnshort=-1 integer :: nlong=-1, nshort=-1 logical :: kfirst=.true., kfilter=.false. + logical :: unaligned_fft=.true. real(dp) :: cglimit=1.d-12 integer :: iter_min=3, iter_max=1000 diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90 index d8cb432..e99b37c 100644 --- a/src/fourier_fftw_2003.f90 +++ b/src/fourier_fftw_2003.f90 @@ -21,9 +21,10 @@ MODULE fourier CONTAINS - SUBROUTINE init_fourier(nof_in) + SUBROUTINE init_fourier(nof_in, unaligned) integer, intent(in) :: nof_in + logical, intent(in) :: unaligned integer :: fftw_planning_strategy real(C_DOUBLE), pointer :: in(:) complex(C_DOUBLE_COMPLEX), pointer :: out(:) @@ -63,7 +64,9 @@ SUBROUTINE init_fourier(nof_in) fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT) !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_PRESERVE_INPUT) - !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_UNALIGNED) + if (unaligned) then + fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_UNALIGNED) + end if plan = fftw_plan_dft_r2c_1d(nof, in, out, fftw_planning_strategy) plan_inv = fftw_plan_dft_c2r_1d(nof, out, in, fftw_planning_strategy) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index c2e9dfe..0edd40c 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -43,6 +43,8 @@ MODULE noise_routines ! Pointers for FFT workspace type(C_PTR), allocatable :: pxx(:), pfx(:) + real(C_DOUBLE), allocatable, target :: xxthreads(:, :) + complex(C_DOUBLE_COMPLEX), allocatable, target :: fxthreads(:, :) ! PSD information @@ -165,18 +167,18 @@ subroutine interpolate_psd(freq, psd, targetfreq, targetpsd) end if flog = log(targetfreq(i)) if (j < n_in - 1) then - do while (logfreq(j+1) < flog) + do while (logfreq(j + 1) < flog) j = j + 1 if (j == n_in - 1) exit end do end if - if (psd(j) > 0 .and. psd(j+1) > 0) then + if (psd(j) > 0 .and. psd(j + 1) > 0) then ! Logarithmic interpolation - r = (flog - logfreq(j)) / (logfreq(j+1) - logfreq(j)) + r = (flog - logfreq(j)) / (logfreq(j + 1) - logfreq(j)) targetpsd(i) = exp(logpsd(j) * (1._dp - r) + logpsd(j + 1) * r) else ! Linear interpolation - r = (targetfreq(i) - freq(j)) / (freq(j+1) - freq(j)) + r = (targetfreq(i) - freq(j)) / (freq(j + 1) - freq(j)) targetpsd(i) = psd(j) * (1._dp - r) + psd(j + 1) * r end if end do @@ -223,7 +225,7 @@ function psd_index_det(idet, starttime) result(ipsd) if (detectors(idet)%psdstarts(ipsd) > starttime) ipsd = -1 return end if - if (detectors(idet)%psdstarts(ipsd+1) > starttime) exit + if (detectors(idet)%psdstarts(ipsd + 1) > starttime) exit ipsd = ipsd + 1 end do @@ -243,6 +245,8 @@ SUBROUTINE init_filter if (.not. kfilter) return if (nof >= 0) return ! Filter already initialized + if (info == 3 .and. ID == 0) write(*,*) 'Initializing filter...' + nofilter = 2 * noba_short_pp_max nof_min = nofilter @@ -256,23 +260,29 @@ SUBROUTINE init_filter end do if (ID == 0) write(*,*) 'FFT length =', nof - !allocate(fx(nof/2+1), xx(nof), stat=ierr) ! Work space + !allocate(fx(nof / 2 + 1), xx(nof), stat=ierr) ! Work space !if (ierr /= 0) call abort_mpi('No room for fx and xx') memory_filter = ((nof / 2 + 1) * 16. + nof * 8) * nthreads - call init_fourier(nof) + call init_fourier(nof, unaligned_fft) ! Permanently allocate space for FFT for each OpenMP thread - allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr) - if (ierr /= 0) stop 'Failed to allocate pxx and pfx' - pxx = C_NULL_PTR - pfx = C_NULL_PTR - do id_thread = 0, nthreads - 1 - pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) - pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) - end do + if (unaligned_fft) then + allocate(xxthreads(nof, 0:nthreads - 1), & + fxthreads(nof / 2 + 1, 0:nthreads - 1), stat=ierr) + if (ierr /= 0) stop 'Failed to allocate xxthreads and fxthreads' + else + allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr) + if (ierr /= 0) stop 'Failed to allocate pxx and pfx' + pxx = C_NULL_PTR + pfx = C_NULL_PTR + do id_thread = 0, nthreads - 1 + pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) + pfx(id_thread) = fftw_alloc_complex(int(nof / 2 + 1, C_SIZE_T)) + end do + end if END SUBROUTINE init_filter @@ -329,11 +339,17 @@ SUBROUTINE close_filter call close_fourier - do id_thread = 0, nthreads - 1 - call fftw_free(pxx(id_thread)) - call fftw_free(pfx(id_thread)) - end do - deallocate(pxx, pfx) + if (unaligned_fft) then + if (allocated(xxthreads)) deallocate(xxthreads, fxthreads) + else + if (allocated(pxx)) then + do id_thread = 0, nthreads - 1 + call fftw_free(pxx(id_thread)) + call fftw_free(pfx(id_thread)) + end do + deallocate(pxx, pfx) + end if + end if nof = -1 @@ -366,13 +382,13 @@ SUBROUTINE build_filter() ! Construct and store the filters if (allocated(fcov)) deallocate(fcov) - allocate(fcov(nof/2+1, npsdtot), & ! Fourier inverse of the noise filter + allocate(fcov(nof / 2 + 1, npsdtot), & ! Fourier inverse of the noise filter psddet(npsdtot), psdind(npsdtot), psdstart(npsdtot), psdstop(npsdtot), & stat=ierr) if (ierr /= 0) call abort_mpi('No room for fcov') - memory_filter = memory_filter + (nof/2+1)*npsdtot*16. + npsdtot*24. + memory_filter = memory_filter + (nof / 2 + 1)*npsdtot*16. + npsdtot*24. - memsum = memory_filter / 2**20 + memsum = memory_filter / 2 ** 20 mem_min = memsum; mem_max = memsum call min_mpi(mem_min); call max_mpi(mem_max) call sum_mpi(memsum) @@ -396,7 +412,7 @@ SUBROUTINE build_filter() id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - allocate(aspec(nof/2+1), spectrum(nof), f(nof), g(nof), stat=ierr) + allocate(aspec(nof / 2 + 1), spectrum(nof), f(nof), g(nof), stat=ierr) if (ierr /= 0) call abort_mpi('No room for aspec') ipsdtot = 0 @@ -415,7 +431,7 @@ SUBROUTINE build_filter() psdind(ipsdtot) = ipsd psdstart(ipsdtot) = detectors(idet)%psdstarts(ipsd) if (ipsd < detectors(idet)%npsd) then - psdstop(ipsdtot) = detectors(idet)%psdstarts(ipsd+1) + psdstop(ipsdtot) = detectors(idet)%psdstarts(ipsd + 1) else psdstop(ipsdtot) = 1e30 end if @@ -436,23 +452,23 @@ SUBROUTINE build_filter() end if do i = 1, nof - x = pi*f(i)/fa + x = pi * f(i) / fa if (x < 1.e-30) then - g(i) = 1.d0 + g(i) = 1 else - g(i) = sin(x)**2/x**2 + g(i) = sin(x) ** 2 / x ** 2 end if end do if (k == 0) then aspec(1) = spectrum(1) else - aspec(1) = aspec(1) + 2*spectrum(1)*g(1) + aspec(1) = aspec(1) + 2 * spectrum(1) * g(1) end if - do i = 2, nof/2+1 - aspec(i) = aspec(i) + spectrum(i)*g(i) & - + spectrum(nof-i+2)*g(nof-i+2) + do i = 2, nof / 2 + 1 + aspec(i) = aspec(i) + spectrum(i) * g(i) & + + spectrum(nof - i + 2) * g(nof - i + 2) end do end do @@ -510,7 +526,7 @@ SUBROUTINE read_spectrum() end if allocate(ftable(nolines), spectrum_table(nolines, nodetectors), & - dtemp(nocols+1, nolines), stemp(nocols), sigmas(nocols), & + dtemp(nocols + 1, nolines), stemp(nocols), sigmas(nocols), & stat=ierr) if (ierr /= 0) call abort_mpi('No room to read the noise spectra') @@ -540,7 +556,7 @@ SUBROUTINE read_spectrum() do idet = 1, nodetectors ! find the column of the read table that corresponds to ! detector # idet - do icol = 1, nocols+1 + do icol = 1, nocols + 1 if (icol > nocols) then call abort_mpi('ERROR: ' // trim(detectors(idet)%name) // & ' not in ' // trim(file_spectrum)) @@ -554,7 +570,7 @@ SUBROUTINE read_spectrum() ! prepare for logaritmic interpolation ! first column was the frequency - spectrum_table(:, idet) = log(dtemp(icol+1, :)) + spectrum_table(:, idet) = log(dtemp(icol + 1, :)) end do deallocate(stemp, dtemp) @@ -564,7 +580,7 @@ SUBROUTINE read_spectrum() call broadcast_mpi(nolines, 0) do idet = 1, nodetectors call broadcast_mpi(detectors(idet)%sigmas, detectors(idet)%npsd, 0) - detectors(idet)%weights = 1 / detectors(idet)%sigmas**2 + detectors(idet)%weights = 1 / detectors(idet)%sigmas ** 2 end do if (id /= 0) then @@ -600,16 +616,16 @@ SUBROUTINE get_spectrum_file() logf = log(f(i)) do - if (logf <= ftable(k+1) .or. k == nolines-1) exit + if (logf <= ftable(k + 1) .or. k == nolines-1) exit k = k + 1 end do - p = (logf-ftable(k)) / (ftable(k+1)-ftable(k)) + p = (logf - ftable(k)) / (ftable(k + 1) - ftable(k)) if (p < 0) p = 0 if (p > 1) p = 1 - logspec = (1.d0-p)*spectrum_table(k, icol) + p*spectrum_table(k+1, icol) + logspec = (1.d0 - p) * spectrum_table(k, icol) + p * spectrum_table(k + 1, icol) spectrum(i) = exp(logspec) ! *sigma*sigma/fsample -RK end do @@ -637,7 +653,7 @@ SUBROUTINE get_spectrum_interp(idet, ipsd, f, spectrum) ! the sigmas have already been updated from the PSDs - !plateau = detectors(idet)%sigmas(ipsd)**2 / fsample + !plateau = detectors(idet)%sigmas(ipsd) ** 2 / fsample plateau = detectors(idet)%plateaus(ipsd) else @@ -695,7 +711,7 @@ SUBROUTINE get_spectrum_interp(idet, ipsd, f, spectrum) loop_bins : do i = 1, nof if (spectrum(i) <= 0) then ! First look for valid value in higher frequency - loop_next_bins: do j = i+1, nof + loop_next_bins: do j = i + 1, nof if (spectrum(j) > 0) then spectrum(i) = spectrum(j) cycle loop_bins @@ -755,7 +771,7 @@ subroutine trim_interval(kstart, kstop, noba, idet, nna) nna(0:basis_order, 0:basis_order, noba_short, nodetectors) integer :: gapmin, gaplen, i - if (all(nna(:, :, kstart+1:kstop, idet) == 0)) then + if (all(nna(:, :, kstart + 1:kstop, idet) == 0)) then noba = 0 return end if @@ -763,7 +779,7 @@ subroutine trim_interval(kstart, kstop, noba, idet, nna) ! trim the beginning do while (kstart < kstop) - if (any(nna(:, :, kstart+1, idet) /= 0)) exit + if (any(nna(:, :, kstart + 1, idet) /= 0)) exit kstart = kstart + 1 end do @@ -799,19 +815,19 @@ subroutine convolve_interval(ichunk, idet, x, y, xx, fx, fc, nna) integer, intent(in) :: ichunk, idet real(dp), intent(out) :: y(0:basis_order, noba_short, nodetectors) real(dp), intent(in) :: x(0:basis_order, noba_short, nodetectors) - complex(dp), intent(in) :: fc(nof/2 + 1, npsdtot) + complex(dp), intent(in) :: fc(nof / 2 + 1, npsdtot) real(dp), intent(in) :: & nna(0:basis_order, 0:basis_order, noba_short, nodetectors) real(C_DOUBLE) :: xx(nof) - complex(C_DOUBLE_COMPLEX) :: fx(nof/2 + 1) + complex(C_DOUBLE_COMPLEX) :: fx(nof / 2 + 1) integer :: noba, kstart, kstop, ipsd kstart = sum(noba_short_pp(1:ichunk-1)) kstop = kstart + noba_short_pp(ichunk) - ipsd = psd_index(idet, baselines_short_time(kstart+1)) + ipsd = psd_index(idet, baselines_short_time(kstart + 1)) - y(0, kstart+1:kstop, idet) = x(0, kstart+1:kstop, idet) + y(0, kstart + 1:kstop, idet) = x(0, kstart + 1:kstop, idet) if (ipsd == -1) then ! no PSD @@ -824,14 +840,14 @@ subroutine convolve_interval(ichunk, idet, x, y, xx, fx, fc, nna) call trim_interval(kstart, kstop, noba, idet, nna) if (noba == 0) exit - xx(1:noba) = x(0, kstart+1:kstart+noba, idet) + xx(1:noba) = x(0, kstart + 1:kstart+noba, idet) xx(1:noba) = xx(1:noba) - sum(xx(1:noba)) / noba - xx(noba+1:) = 0 + xx(noba + 1:) = 0 call dfft(fx, xx) fx = fx * fc(:, ipsd) call dfftinv(xx, fx) xx(1:noba) = xx(1:noba) - sum(xx(1:noba)) / noba - y(0, kstart+1:kstart+noba, idet) = xx(1:noba) + y(0, kstart + 1:kstart+noba, idet) = xx(1:noba) kstart = kstart + noba end do @@ -847,7 +863,7 @@ SUBROUTINE convolve_pp(y, x, fc, nna) real(dp), intent(out) :: y(noba_short, nodetectors) real(dp), intent(in) :: x(noba_short, nodetectors) - complex(dp), intent(in) :: fc(nof/2+1, npsdtot) + complex(dp), intent(in) :: fc(nof / 2 + 1, npsdtot) real(dp), intent(in) :: & nna(0:basis_order, 0:basis_order, noba_short, nodetectors) @@ -863,15 +879,21 @@ SUBROUTINE convolve_pp(y, x, fc, nna) !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(nof, nodetectors, ninterval, noba_short_pp, & - !$OMP baselines_short_time, fc, x, y, nthreads, nna, pxx, pfx) & + !$OMP baselines_short_time, fc, x, y, nthreads, nna, pxx, pfx, & + !$OMP unaligned_fft, xxthreads, fxthreads) & !$OMP PRIVATE(idet, ichunk, m, xx, fx, ierr, id_thread, itask, & !$OMP num_threads) id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - call c_f_pointer(pxx(id_thread), xx, [nof]) - call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1]) + if (unaligned_fft) then + xx => xxthreads(:, id_thread) + fx => fxthreads(:, id_thread) + else + call c_f_pointer(pxx(id_thread), xx, [nof]) + call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1]) + end if itask = -1 do idet = 1, nodetectors @@ -929,15 +951,15 @@ SUBROUTINE construct_preconditioner(nna) do ichunk = 1, ninterval kstart = sum(noba_short_pp(1:ichunk-1)) noba = noba_short_pp(ichunk) - ipsddet = psd_index_det(idet, baselines_short_time(kstart+1)) + ipsddet = psd_index_det(idet, baselines_short_time(kstart + 1)) if (ipsddet < 0) then ! No PSD available - do k = kstart+1, kstart+noba + do k = kstart + 1, kstart+noba prec_diag(k, idet) = 1. end do cycle end if - do k = kstart+1, kstart+noba + do k = kstart + 1, kstart+noba if (nna(0, 0, k, idet) == 0) then prec_diag(k, idet) = 1. / detectors(idet)%weights(ipsddet) else @@ -951,10 +973,10 @@ SUBROUTINE construct_preconditioner(nna) ! Build fprecond to act as a fall back option if (allocated(fprecond)) deallocate(fprecond) - allocate(fprecond(nof/2+1, npsdtot), stat=ierr) + allocate(fprecond(nof / 2 + 1, npsdtot), stat=ierr) if (ierr /= 0) call abort_mpi('No room for fprecond') fprecond = 0 - memory_precond = memory_precond + (nof/2+1)*npsdtot*16. + memory_precond = memory_precond + (nof / 2 + 1)*npsdtot*16. ipsd = 0 do idet = 1, nodetectors do ipsddet = 1, detectors(idet)%npsd @@ -979,14 +1001,20 @@ SUBROUTINE construct_preconditioner(nna) !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & - !$OMP SHARED(nof, npsdtot, fcov, invcov, pxx, pfx) & + !$OMP SHARED(nof, npsdtot, fcov, invcov, pxx, pfx, & + !$OMP unaligned_fft, xxthreads, fxthreads) & !$OMP PRIVATE(id_thread, num_threads, ipsd, xx, fx) id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - call c_f_pointer(pxx(id_thread), xx, [nof]) - call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1]) + if (unaligned_fft) then + xx => xxthreads(:, id_thread) + fx => fxthreads(:, id_thread) + else + call c_f_pointer(pxx(id_thread), xx, [nof]) + call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1]) + end if do ipsd = 1, npsdtot if (modulo(ipsd-1, num_threads) /= id_thread) cycle @@ -1030,10 +1058,10 @@ SUBROUTINE construct_preconditioner(nna) loop_subchunk : do call trim_interval(kstart, kstop, noba, idet, nna) if (noba == 0) exit - ipsd = psd_index(idet, baselines_short_time(kstart+1)) + ipsd = psd_index(idet, baselines_short_time(kstart + 1)) if (ipsd == -1 .or. noba < 1 .or. & - all(nna(0, 0, kstart+1:kstart+noba, idet) == 0)) then + all(nna(0, 0, kstart + 1:kstart+noba, idet) == 0)) then nempty = nempty + 1 cycle end if @@ -1041,15 +1069,15 @@ SUBROUTINE construct_preconditioner(nna) ! Rough guess for the width of the preconditioner based on ! total power in the band - p_tot = sum(invcov(:nof/2, ipsd)**2) + p_tot = sum(invcov(:nof / 2, ipsd) ** 2) try = 1 ierr = -1 loop_try : do - ! Rows from kstart+1 to kstart+noba for one band-diagonal + ! Rows from kstart + 1 to kstart+noba for one band-diagonal ! submatrix. Do the computation in double precision nband = min(noba, try * precond_width_min) nband = min(nband, precond_width_max) - p = sum(invcov(:nband, ipsd)**2) + p = sum(invcov(:nband, ipsd) ** 2) if (p > plim * p_tot) then ierr = 0 exit @@ -1064,7 +1092,7 @@ SUBROUTINE construct_preconditioner(nna) do call get_cholesky_decomposition( & bandprec(isub, ichunk, idet)%data, nband, noba, & - invcov(1, ipsd), nna(0, 0, kstart+1, idet), ierr) + invcov(1, ipsd), nna(0, 0, kstart + 1, idet), ierr) if (ierr == 0) exit @@ -1084,7 +1112,7 @@ SUBROUTINE construct_preconditioner(nna) !write (*, '(1x,a,i0,a,es18.10,a,es18.10)') & ! 'Cholesky decomposition failed for ' // & ! trim(detectors(idet)%name) // ', noba = ', noba, ', t = ', & - ! baselines_short_time(kstart+1), ' - ', & + ! baselines_short_time(kstart + 1), ' - ', & ! baselines_short_time(kstart+noba) nfail = nfail + 1 else @@ -1114,7 +1142,7 @@ SUBROUTINE construct_preconditioner(nna) end if - memsum = memory_precond / 2**20 + memsum = memory_precond / 2 ** 20 mem_min = memsum; mem_max = memsum call min_mpi(mem_min); call max_mpi(mem_max) call sum_mpi(memsum) @@ -1170,7 +1198,8 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) !$OMP DEFAULT(NONE) & !$OMP SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, & !$OMP id, nof, nshort, detectors, nthreads, fprecond, nna, & - !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx) & + !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx, & + !$OMP unaligned_fft, xxthreads, fxthreads) & !$OMP PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, & !$OMP xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, & !$OMP t1, t2, tf) @@ -1180,8 +1209,13 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - call c_f_pointer(pxx(id_thread), xx, [nof]) - call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1]) + if (unaligned_fft) then + xx => xxthreads(:, id_thread) + fx => fxthreads(:, id_thread) + else + call c_f_pointer(pxx(id_thread), xx, [nof]) + call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1]) + end if itask = -1 do idet = 1, nodetectors @@ -1194,21 +1228,21 @@ SUBROUTINE preconditioning_band(z, r, nna, istep) if (noba == 0) exit loop_subchunk itask = itask + 1 if (modulo(itask, num_threads) == id_thread) then - ipsd = psd_index_det(idet, baselines_short_time(kstart+1)) + ipsd = psd_index_det(idet, baselines_short_time(kstart + 1)) if (ipsd < 0) cycle if (detectors(idet)%weights(ipsd) == 0) cycle - ipsd = psd_index(idet, baselines_short_time(kstart+1)) + ipsd = psd_index(idet, baselines_short_time(kstart + 1)) if (allocated(bandprec(isub, ichunk, idet)%data)) then ! Use the precomputed Cholesky decomposition nband = size(bandprec(isub, ichunk, idet)%data, 1) call apply_cholesky_decomposition( & - noba, z(0, kstart+1, idet), r(0, kstart+1, idet), & + noba, z(0, kstart + 1, idet), r(0, kstart + 1, idet), & nband, bandprec(isub, ichunk, idet)%data) else ! Use CG iteration to apply the preconditioner call apply_CG_preconditioner( & - noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), & - z(0, kstart+1, idet), r(0, kstart+1, idet), & + noba, nof, fcov(1, ipsd), nna(0, 0, kstart + 1, idet), & + z(0, kstart + 1, idet), r(0, kstart + 1, idet), & invcov(1, ipsd), fx, xx, tf) end if end if @@ -1279,12 +1313,12 @@ subroutine apply_CG_preconditioner( & ! Apply the preconditioner using CG iteration ! integer, intent(in) :: noba, nof - complex(dp), intent(in) :: fcov(nof/2+1) + complex(dp), intent(in) :: fcov(nof / 2 + 1) real(dp), intent(in) :: nna(noba), invcov(noba) real(dp), intent(out) :: x(noba) real(dp), intent(in) :: b(noba) real(C_DOUBLE), intent(inout) :: xx(nof) - complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1) + complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof / 2 + 1) real(dp), intent(out) :: tfilter real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:) From 4252a4a083a35571a354fdcebfaf4a45c31f8378 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Wed, 21 Aug 2019 13:12:46 -0700 Subject: [PATCH 09/17] add option to toggle unaligned FFT --- src/inputparam.f90 | 2 ++ src/parameter_control.f90 | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/inputparam.f90 b/src/inputparam.f90 index 9e79008..90c0cbd 100644 --- a/src/inputparam.f90 +++ b/src/inputparam.f90 @@ -294,6 +294,8 @@ SUBROUTINE read_line(line) read(value, *, iostat=ierr) kfilter case ('diagfilter') read(value, *, iostat=ierr) diagfilter + case ('unaligned_fft') + read(value, *, iostat=ierr) unaligned_fft case ('precond_width_min') read(value, *, iostat=ierr) precond_width_min case ('precond_width_max') diff --git a/src/parameter_control.f90 b/src/parameter_control.f90 index c1170e6..49acbfe 100644 --- a/src/parameter_control.f90 +++ b/src/parameter_control.f90 @@ -620,12 +620,13 @@ SUBROUTINE write_parameters() if (kfirst) then write (*,*) - write (*,fk) 'kfirst',kfirst, 'First destriping ON' - write (*,ff) 'dnshort',dnshort,'Baseline length (samples)' + write (*,fk) 'kfirst', kfirst, 'First destriping ON' + write (*,ff) 'dnshort', dnshort, 'Baseline length (samples)' write (*,ff) ' ',dnshort/fsample,'seconds' if (kfilter) then - write (*,fk) 'kfilter',kfilter,'Noise filter ON' + write (*,fk) 'kfilter', kfilter, 'Noise filter ON' + write (*,fk) 'unaligned_fft', unaligned_fft else write (*,fk) 'kfilter', kfilter, 'Noise filter OFF' write (*,fe) 'diagfilter', diagfilter, 'diagonal baseline filter' From d3985b3dbe8bf95149a0442f8ed3bf25a2f4ac2e Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Thu, 22 Aug 2019 15:27:45 -0700 Subject: [PATCH 10/17] disable system wisdom loadimg --- src/fourier_fftw_2003.f90 | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90 index e99b37c..6942bff 100644 --- a/src/fourier_fftw_2003.f90 +++ b/src/fourier_fftw_2003.f90 @@ -46,15 +46,15 @@ SUBROUTINE init_fourier(nof_in, unaligned) call c_f_pointer(pin, in, [nof]) call c_f_pointer(pout, out, [nof/2 + 1]) - ierr = fftw_import_system_wisdom() - - if (info /= 0) then - if (ierr /= 0) then - write (*,*) 'FFTW: System wide wisdom loaded' - else - write (*,*) 'FFTW: Unable to load system wisdom' - end if - end if +!!$ ierr = fftw_import_system_wisdom() +!!$ +!!$ if (info /= 0) then +!!$ if (ierr /= 0) then +!!$ write (*,*) 'FFTW: System wide wisdom loaded' +!!$ else +!!$ write (*,*) 'FFTW: Unable to load system wisdom' +!!$ end if +!!$ end if !if (nof <= 65536) then fftw_planning_strategy = FFTW_MEASURE From f565341e49ea94f95e06964d90c775d36efb2e4e Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Fri, 30 Aug 2019 14:33:29 -0700 Subject: [PATCH 11/17] time filter initialization better --- src/fourier_fftw_2003.f90 | 4 ++-- src/noise_routines.f90 | 20 ++++++++++++++------ src/smadam.F90 | 9 +++++++++ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90 index 6942bff..6f9e363 100644 --- a/src/fourier_fftw_2003.f90 +++ b/src/fourier_fftw_2003.f90 @@ -57,9 +57,9 @@ SUBROUTINE init_fourier(nof_in, unaligned) !!$ end if !if (nof <= 65536) then - fftw_planning_strategy = FFTW_MEASURE + !fftw_planning_strategy = FFTW_MEASURE !else - ! fftw_planning_strategy = FFTW_ESTIMATE + fftw_planning_strategy = FFTW_ESTIMATE !end if fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT) diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 0edd40c..8d0228f 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -19,10 +19,11 @@ MODULE noise_routines real(dp), save, public :: memory_filter = 0, memory_precond = 0, & cputime_filter = 0, cputime_precond = 0, & - cputime_prec_construct = 0 + cputime_prec_construct = 0, & + cputime_filter_allocate = 0, & + cputime_filter_init = 0, & + cputime_filter_build = 0 - !real(dp), allocatable :: xx(:) - !complex(dp), allocatable :: fx(:) complex(dp), allocatable, target :: fcov(:, :), fprecond(:, :) real(dp), allocatable :: invcov(:, :) @@ -260,13 +261,14 @@ SUBROUTINE init_filter end do if (ID == 0) write(*,*) 'FFT length =', nof - !allocate(fx(nof / 2 + 1), xx(nof), stat=ierr) ! Work space - !if (ierr /= 0) call abort_mpi('No room for fx and xx') - memory_filter = ((nof / 2 + 1) * 16. + nof * 8) * nthreads + call reset_time(16) + call init_fourier(nof, unaligned_fft) + cputime_filter_init = cputime_filter_init + get_time_and_reset(16) + ! Permanently allocate space for FFT for each OpenMP thread if (unaligned_fft) then @@ -284,6 +286,8 @@ SUBROUTINE init_filter end do end if + cputime_filter_allocate = cputime_filter_allocate + get_time(16) + END SUBROUTINE init_filter @@ -375,6 +379,8 @@ SUBROUTINE build_filter() if (info == 3 .and. ID == 0) write(*,*) 'Building filter...' if (info > 4) write(*,*) 'Building filter...' + call reset_time(16) + npsdtot = 0 do idet = 1, nodetectors npsdtot = npsdtot + detectors(idet)%npsd @@ -496,6 +502,8 @@ SUBROUTINE build_filter() if (kread_file) deallocate(ftable, spectrum_table) + cputime_filter_build = cputime_filter_build + get_time(16) + if (info > 4) write(*,*) 'Done' CONTAINS diff --git a/src/smadam.F90 b/src/smadam.F90 index 7c49527..038c807 100644 --- a/src/smadam.F90 +++ b/src/smadam.F90 @@ -499,6 +499,9 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & call write_time('Binning TOD', cputime_bin_maps) call write_time('Sending binned TOD', cputime_send_maps) call write_time('Counting hits', cputime_count_hits) + call write_time('Filter - allocate', cputime_filter_allocate) + call write_time('Filter - initialize', cputime_filter_init) + call write_time('Filter - build', cputime_filter_build) call write_time('Building preconditioner', cputime_prec_construct) call write_time('Subtract/add baselines', & cputime_clean_tod+cputime_unclean_tod) @@ -863,6 +866,9 @@ subroutine destripe_with_cache(comm, ndet, nsamp, nnz, & call write_time('Binning TOD', cputime_bin_maps) call write_time('Sending binned TOD', cputime_send_maps) call write_time('Counting hits', cputime_count_hits) + call write_time('Filter - allocate', cputime_filter_allocate) + call write_time('Filter - initialize', cputime_filter_init) + call write_time('Filter - build', cputime_filter_build) call write_time('Building preconditioner', cputime_prec_construct) call write_time('Subtract/add baselines', & cputime_clean_tod + cputime_unclean_tod) @@ -1354,6 +1360,9 @@ subroutine reset_timers() cputime_send_maps = 0 cputime_count_hits = 0 cputime_prec_construct = 0 + cputime_filter_allocate = 0 + cputime_filter_init = 0 + cputime_filter_build = 0 cputime_cga_init = 0 cputime_cga = 0 From e4d6ef193a39400d37178dc6f0a7e34f77d86e3a Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Fri, 30 Aug 2019 16:34:02 -0700 Subject: [PATCH 12/17] make memory reporting more helpful --- src/maps_and_baselines.f90 | 23 +----- src/maptod_transfer.f90 | 62 +++++++--------- src/memory_and_time.f90 | 146 +++++++++++++++++++++++-------------- src/noise_routines.f90 | 25 ++----- src/parameter_control.f90 | 12 +-- src/pointing.f90 | 12 +-- src/smadam.F90 | 24 +++--- src/tod_storage.f90 | 11 +-- 8 files changed, 143 insertions(+), 172 deletions(-) diff --git a/src/maps_and_baselines.f90 b/src/maps_and_baselines.f90 index 79e7019..1e68104 100644 --- a/src/maps_and_baselines.f90 +++ b/src/maps_and_baselines.f90 @@ -8,7 +8,7 @@ MODULE maps_and_baselines noba_short, nodetectors, do_binmap, do_hits, do_mask, & ndetset, nmap, nopix_cross, nopix_map, nsurvey, use_inmask use mpi_wrappers, only : min_mpi, max_mpi, sum_mpi - use memory_and_time, only : check_stat + use memory_and_time, only : check_stat, write_memory implicit none private @@ -102,16 +102,7 @@ SUBROUTINE allocate_maps allocate(cca(nmap, nmap, 0:0), wamap(nmap, 0:0)) endif - memsum = memory_maps / 2. ** 20. - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - - if (ID==0 .and. info > 0) then - write(*,mstr3) 'Allocated memory for maps:', memsum, mem_min, mem_max - end if + call write_memory("Map memory", memory_maps) END SUBROUTINE allocate_maps @@ -171,15 +162,7 @@ SUBROUTINE allocate_baselines allocate(yba(1, 1, 1), nna(1, 1, 1, 1)) endif - memsum = memory_baselines / 2**20 - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - - if (ID==0 .and. info > 0) write(*,mstr3) & - 'Allocated memory for baselines:', memsum, mem_min, mem_max + call write_memory("Baseline memory", memory_baselines) END SUBROUTINE allocate_baselines diff --git a/src/maptod_transfer.f90 b/src/maptod_transfer.f90 index 6085ff4..5234e36 100644 --- a/src/maptod_transfer.f90 +++ b/src/maptod_transfer.f90 @@ -2,6 +2,7 @@ MODULE maptod_transfer use commonparam use mpi_wrappers + use memory_and_time, only : write_memory implicit none private @@ -54,31 +55,42 @@ SUBROUTINE update_maptod_transfer(ksubmap) logical, allocatable, intent(in) :: ksubmap(:) integer(i4b) :: ierr real(sp) :: memsum, mem_min, mem_max + integer :: nolocmaps_min, nolocmaps_max + real(dp) :: nolocmaps_sum nolocmaps = count(ksubmap(0:nosubmaps_tot-1)) nolocpix = nolocmaps * nosubpix_max + nolocmaps_min = nolocmaps + nolocmaps_max = nolocmaps + nolocmaps_sum = nolocmaps + call min_mpi(nolocmaps_min) + call max_mpi(nolocmaps_max) + call sum_mpi(nolocmaps_sum) + if (ID == 0 .and. info > 0) then + write(*, '(x,"Total submaps = ",i0," submap size = ",i0)') & + nosubmaps_tot, nosubpix_max + write(*, '(x,"Local submaps: min = ",i0," max = ",i0," mean = ",f8.2)') & + nolocmaps_min, nolocmaps_max, nolocmaps_sum / ntasks + end if + if (.not. allocated(ksubmap_table)) then ! Default Fortran logical is 32 bits (!) if (allreduce) then - allocate(ksubmap_table(0:nosubmaps_tot-1, 0:0), stat=ierr) + allocate(ksubmap_table(0:nosubmaps_tot - 1, 0:0), stat=ierr) memory_ksubmap = nosubmaps_tot*4 else - allocate(ksubmap_table(0:nosubmaps_tot-1, 0:ntasks-1), stat=ierr) - memory_ksubmap = nosubmaps_tot*ntasks*4 + allocate(ksubmap_table(0:nosubmaps_tot - 1, 0:ntasks-1), stat=ierr) + memory_ksubmap = nosubmaps_tot * ntasks * 4 end if if (ierr /= 0) call abort_mpi('No room for ksubmap_table') ksubmap_table = .false. - if (ID==0 .and. info > 0) then - memsum = memory_ksubmap / 2**20 - write(*,mstr3) 'Allocated memory for submap table:', & - memsum, memsum, memsum - end if + call write_memory('Submap table memory', memory_ksubmap) end if if (allreduce) then ! All processes share ksubmap - ksubmap_table(:, 0) = ksubmap(0:nosubmaps_tot-1) + ksubmap_table(:, 0) = ksubmap(0:nosubmaps_tot - 1) else call mpi_allgather(ksubmap, nosubmaps_tot, MPI_LOGICAL, & ksubmap_table, nosubmaps_tot, MPI_LOGICAL, comm, ierr) @@ -98,7 +110,7 @@ SUBROUTINE update_maptod_transfer(ksubmap) if (ierr /= 0) call abort_mpi('Failed to allocate locmap') - memory_locmap = (nsize_locmap+1) * (nmap*8.+nmap**2*8.+8) + memory_locmap = (nsize_locmap + 1) * (nmap * 8 + nmap ** 2 * 8 + 4) end if @@ -107,17 +119,7 @@ SUBROUTINE update_maptod_transfer(ksubmap) locmask = 0 lochits = 0 - memsum = memory_locmap / 2**20 - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - - if (ID==0 .and. info > 0) then - write(*,mstr3) 'Allocated memory for local maps:', & - memsum, mem_min, mem_max - end if + call write_memory('local maps memory', memory_locmap) END SUBROUTINE update_maptod_transfer @@ -165,14 +167,7 @@ SUBROUTINE initialize_alltoallv() + recvcounts_gather(itask-1) end do - memsum = memory_all2all / 2**20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for allreduce:', memsum, mem_min, mem_max - end if + call write_memory("Allreduce memory", memory_all2all) end if ! allreduce implies concatenate_messages = .false. @@ -293,14 +288,7 @@ SUBROUTINE initialize_alltoallv() offset = offset + nrecv end do - memsum = memory_all2all / 2**20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for all2allv:', memsum, mem_min, mem_max - end if + call write_memory("All2allv memory", memory_all2all) END SUBROUTINE initialize_alltoallv diff --git a/src/memory_and_time.f90 b/src/memory_and_time.f90 index b08d6c2..6907c35 100644 --- a/src/memory_and_time.f90 +++ b/src/memory_and_time.f90 @@ -34,26 +34,57 @@ END SUBROUTINE check_stat !------------------------------------------------------------------------------ - SUBROUTINE write_memory(text, memory_in) - - character(len=*), intent(in) :: text - real(dp), intent(in), optional :: memory_in - real(dp) :: memory_sum - - if (text=='Total') then - if (ID==0) then - write(*, '(x,a,t26,f9.1," MB")') text, memory_total - write(*, '(x,a,t26,f9.1," GB")') ' ', memory_total/1024. - endif + function mem_to_string(mem) result(memstring) + ! Return a string representation of memory value + real(dp) :: mem ! memory in MB + character(len=12) :: memstring + character(len=2) :: unit + + if (mem > 2 ** 30) then + mem = mem / 2 ** 30 + unit = 'PB' + else if (mem > 2 ** 20) then + mem = mem / 2 ** 20 + unit = 'TB' + else if (mem > 2 ** 10) then + mem = mem / 2 ** 10 + unit = 'GB' + else if (mem > 1) then + unit = 'MB' + else if (mem > 1 / 2 ** 10) then + mem = mem * 2 ** 10 + unit = 'kB' else - memory_sum = memory_in/1024./1024. - call sum_mpi(memory_sum) + mem = mem ** 2 ** 20 + unit = 'B ' + end if + write(memstring, '(f9.2,x,a2)') mem, unit + end function mem_to_string - memory_total = memory_total+memory_sum - if (ID==0.and.memory_sum.gt.0.05) & - write(*, '(x,a,t26,f9.1)') text, memory_sum - endif + SUBROUTINE write_memory(text, memory_in) + character(len=*), intent(in) :: text ! prefix the memory string + real(dp), intent(in), optional :: memory_in ! memory in bytes + real(dp) :: memory, memory_sum, memory_min, memory_max + + if (text == 'Total') then + memory = memory_total + else + memory = memory_in / 2 ** 20 + memory_total = memory_total + memory + end if + + memory_min = memory + memory_max = memory + memory_sum = memory + call min_mpi(memory_min) + call max_mpi(memory_max) + call sum_mpi(memory_sum) + if (ID == 0 .and. info > 0) then + write(*, '(x,a,t28,"min = ",a," max =",a," total =",a)') text, & + mem_to_string(memory_min), mem_to_string(memory_max), & + mem_to_string(memory_sum) + end if END SUBROUTINE write_memory @@ -66,55 +97,58 @@ SUBROUTINE write_time(text, time) character(len=*), intent(in) :: text real(dp), intent(in), optional :: time - real(dp) :: time_min, time_max ! -RK - - if (text=='- Other') then - time2 = time1-time2_sum - - if (ID==0.and.time2.gt.0.1) & - write(*,'(3x,a,t32,f7.1)') text, time2 - - elseif (text(1:1)=='-') then - time2 = time - - time_min = time2; time_max = time2 ! -RK - call min_mpi(time_min); call max_mpi(time_max) ! -RK - + real(dp) :: time_min, time_max + + if (text(1:1) == '-') then + if (text == '- Other') then + ! Difference between top level timer and the sum of the sub timers + time2 = time1 - time2_sum + else + ! sub timer + time2 = time + end if + time_min = time2 + time_max = time2 + call min_mpi(time_min) + call max_mpi(time_max) call sum_mpi(time2) - - time2 = time2/ntasks - time2_sum = time2_sum+time2 - - !if (ID==0.and.time2.gt.0.05) write(*,'(3x,a,t32,f8.1)') text, time2 - if (ID==0.and.time2.gt.0.05) write(*,'(3x,a,t32,3f8.1)') & ! -RK - text, time2, time_min, time_max ! -RK - - elseif (text=='Total') then + time2 = time2 / ntasks + time2_sum = time2_sum + time2 + + if (ID == 0 .and. time2 > 0.05) then + write (*,'(3x,a,t34,"mean =",f8.1," min =",f8.1," max =",f8.1)') & + text, time2, time_min, time_max + end if + else if (text == 'Total') then + ! zero level timer time1 = time call sum_mpi(time1) - time1 = time1/ntasks - - if (ID==0) write(*,'(x,a,t28,f8.1," s (",f8.2," CPU hours)")') & - text, time1, time1*ntasks*nthreads_max/3600. + time1 = time1 / ntasks + if (ID == 0) then + write(*,'(x,a,t28,f8.1," s (",f12.2," CPU hours)")') & + text, time1, time1 * ntasks * nthreads_max / 3600. + end if else + ! New top level timer, possibly followed by sub timers time1 = time + time_min = time1 + time_max = time1 + call min_mpi(time_min) + call max_mpi(time_max) - time_min = time1; time_max = time1 ! -RK - call min_mpi(time_min); call max_mpi(time_max) ! -RK - - time_cum = time_cum+time1 + time_cum = time_cum + time1 call sum_mpi(time1) - time1 = time1/ntasks - time2_sum = 0.0 - - !if (ID==0.and.time1.gt.0.05) write (*,'(x,a,t28,f8.1)') text, time1 -RK - if (ID==0.and.time1.gt.0.05) write (*,'(x,a,t28,3f8.1)') & - text, time1, time_min, time_max - - endif + time1 = time1 / ntasks + time2_sum = 0 + + if (ID == 0 .and. time1 > 0.05) then + write (*,'(x,a,t34,"mean =",f8.1," min =",f8.1," max =",f8.1)') & + text, time1, time_min, time_max + end if + end if END SUBROUTINE write_time diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 8d0228f..6769f2a 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -7,6 +7,7 @@ MODULE noise_routines use fourier use mpi_wrappers use timing + use memory_and_time, only : write_memory use tod_storage, only : sampletime @@ -392,16 +393,9 @@ SUBROUTINE build_filter() psddet(npsdtot), psdind(npsdtot), psdstart(npsdtot), psdstop(npsdtot), & stat=ierr) if (ierr /= 0) call abort_mpi('No room for fcov') - memory_filter = memory_filter + (nof / 2 + 1)*npsdtot*16. + npsdtot*24. - - memsum = memory_filter / 2 ** 20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(1x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for filter:', memsum, mem_min, mem_max - end if + memory_filter = memory_filter + (nof / 2 + 1) * npsdtot * 16 + npsdtot * 24 + + call write_memory("Filter memory", memory_filter) kread_file = (len_trim(file_spectrum) > 0) if (kread_file) call read_spectrum @@ -1142,7 +1136,7 @@ SUBROUTINE construct_preconditioner(nna) print *, 'Constructed ', sum(ntries), ' band preconditioners.' do try = 1, trymax if (ntries(try) > 0) print *, ntries(try), ' at width = ', & - min(try*precond_width_min, precond_width_max) + min(try * precond_width_min, precond_width_max) end do if (nfail > 0) print *, ' ', nfail, ' failed (using CG)' if (nempty > 0) print *, ' Skipped ', nempty, ' empty intervals.' @@ -1150,14 +1144,7 @@ SUBROUTINE construct_preconditioner(nna) end if - memsum = memory_precond / 2 ** 20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(1x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for precond:', memsum, mem_min, mem_max - end if + call write_memory("Precond memory", memory_precond) cputime_prec_construct = cputime_prec_construct + get_time(16) diff --git a/src/parameter_control.f90 b/src/parameter_control.f90 index 49acbfe..4633140 100644 --- a/src/parameter_control.f90 +++ b/src/parameter_control.f90 @@ -8,6 +8,7 @@ MODULE parameter_control use pointing, only : subchunk use maps_and_baselines, only : memory_baselines, memory_maps, & memory_basis_functions + use memory_and_time, only : write_memory implicit none private @@ -933,16 +934,7 @@ SUBROUTINE start_timeloop() end do end if ! if (kfirst) - memsum = memory_basis_functions / 2**20 - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,mstr3) 'Allocated memory for basis_functions:', & - memsum, mem_min, mem_max - end if + call write_memory("Basis function memory", memory_basis_functions) !end if diff --git a/src/pointing.f90 b/src/pointing.f90 index acd2d37..4334c01 100644 --- a/src/pointing.f90 +++ b/src/pointing.f90 @@ -5,7 +5,7 @@ MODULE pointing use commonparam use mpi_wrappers - use memory_and_time, only : check_stat + use memory_and_time, only : check_stat, write_memory implicit none private @@ -59,15 +59,7 @@ SUBROUTINE init_pointing dummy_pixel = 12 * nside_max ** 2 - memory = memory_pointing / 2d0 ** 20 - - mem_min = memory; mem_max = memory - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memory) - - if (id == 0 .and. info > 0) write(*,'(a,t32,3(f12.1," MB"))') & - ' Allocated memory for pointing:', memory, mem_min, mem_max + call write_memory("Pointing memory", memory_pointing) END SUBROUTINE init_pointing diff --git a/src/smadam.F90 b/src/smadam.F90 index 038c807..201f980 100644 --- a/src/smadam.F90 +++ b/src/smadam.F90 @@ -457,6 +457,7 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & if (id == 0) write(*,*) if (id == 0) write(*,*) 'MEMORY (MB):' + memory_total = 0 call write_memory('Detector pointing', memory_pointing) call write_memory('TOD buffer', memory_tod) call write_memory('Maps', memory_maps) @@ -824,17 +825,18 @@ subroutine destripe_with_cache(comm, ndet, nsamp, nnz, & if (id == 0) write(*,*) if (id == 0) write(*,*) 'MEMORY (MB):' - call write_memory('Detector pointing', memory_pointing) - call write_memory('TOD buffer', memory_tod) - call write_memory('Maps', memory_maps) - call write_memory('Baselines', memory_baselines) - call write_memory('Basis functions', memory_basis_functions) - call write_memory('Noise filter', memory_filter) - call write_memory('Preconditioner', memory_precond) - call write_memory('Submap table', memory_ksubmap) - call write_memory('Temporary maps', memory_locmap) - call write_memory('All2All buffers', memory_all2all) - call write_memory('CG work space', memory_cg) + memory_total = 0 + call write_memory('Detector pointing', memory_pointing) + call write_memory('TOD buffer', memory_tod) + call write_memory('Maps', memory_maps) + call write_memory('Baselines', memory_baselines) + call write_memory('Basis functions', memory_basis_functions) + call write_memory('Noise filter', memory_filter) + call write_memory('Preconditioner', memory_precond) + call write_memory('Submap table', memory_ksubmap) + call write_memory('Temporary maps', memory_locmap) + call write_memory('All2All buffers', memory_all2all) + call write_memory('CG work space', memory_cg) call write_memory('NCM', memory_ncm) call write_memory('Total') end if diff --git a/src/tod_storage.f90 b/src/tod_storage.f90 index eae6107..a2acf95 100644 --- a/src/tod_storage.f90 +++ b/src/tod_storage.f90 @@ -6,7 +6,7 @@ MODULE tod_storage use commonparam use mpi_wrappers - use memory_and_time, only : check_stat + use memory_and_time, only : check_stat, write_memory implicit none private @@ -43,14 +43,7 @@ SUBROUTINE allocate_tod memory_tod = memory_tod + nosamples_proc surveyflags = .true. - memsum = memory_tod/1024./1024. - - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - - if (id == 0 .and. info >= 1) & - write(*,mstr3) 'Allocated memory for TOD:', memsum, mem_min, mem_max + call write_memory("TOD memory", memory_tod) END SUBROUTINE allocate_tod From 247db627178e574f4a08f905f9f893c251c40d63 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Sat, 31 Aug 2019 09:55:55 -0700 Subject: [PATCH 13/17] avoid overflow --- src/maptod_transfer.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/maptod_transfer.f90 b/src/maptod_transfer.f90 index 5234e36..9fafe56 100644 --- a/src/maptod_transfer.f90 +++ b/src/maptod_transfer.f90 @@ -110,7 +110,7 @@ SUBROUTINE update_maptod_transfer(ksubmap) if (ierr /= 0) call abort_mpi('Failed to allocate locmap') - memory_locmap = (nsize_locmap + 1) * (nmap * 8 + nmap ** 2 * 8 + 4) + memory_locmap = (nsize_locmap + 1.) * (nmap * 8. + nmap ** 2 * 8. + 4) end if From c785d1a70fee891da72f148e7c8074c26d138b0b Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Sat, 31 Aug 2019 09:56:33 -0700 Subject: [PATCH 14/17] update dependencies --- src/Makefile.am | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index a320394..ae05abb 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -72,7 +72,8 @@ map_routines.lo : map_routines.f90 commonparam.lo matrix.lo mpi_wrappers.lo \ submap_transfer.lo : submap_transfer.f90 commonparam.lo mpi_wrappers.lo parameter_control.lo : parameter_control.f90 commonparam.lo \ - noise_routines.lo pointing.lo mpi_wrappers.lo maps_and_baselines.lo + noise_routines.lo pointing.lo mpi_wrappers.lo maps_and_baselines.lo \ + memory_and_time.lo pointing.lo : pointing.f90 commonparam.lo mpi_wrappers.lo memory_and_time.lo @@ -84,14 +85,16 @@ maps_and_baselines.lo : maps_and_baselines.f90 commonparam.lo mpi_wrappers.lo \ tod_storage.lo : tod_storage.f90 commonparam.lo mpi_wrappers.lo memory_and_time.lo noise_routines.lo : noise_routines.f90 commonparam.lo fourier_fftw_2003.lo \ - mpi_wrappers.lo timing.lo tod_storage.lo + mpi_wrappers.lo timing.lo tod_storage.lo memory_and_time.lo memory_and_time.lo : memory_and_time.f90 commonparam.lo mpi_wrappers.lo output.lo : output.f90 commonparam.lo submap_transfer.lo \ - mpi_wrappers.lo timing.lo maps_and_baselines.lo tod_storage.lo fitsmod2.lo + mpi_wrappers.lo timing.lo maps_and_baselines.lo tod_storage.lo \ + fitsmod2.lo -maptod_transfer.lo : maptod_transfer.f90 commonparam.lo mpi_wrappers.lo +maptod_transfer.lo : maptod_transfer.f90 commonparam.lo mpi_wrappers.lo \ + memory_and_time.lo smadam_routines.lo : smadam_routines.f90 commonparam.lo mpi_wrappers.lo \ maptod_transfer.lo noise_routines.lo map_routines.lo \ @@ -115,7 +118,6 @@ smadam.lo : smadam.F90 commonparam.lo inputparam.lo \ covmat.lo covmat_util.lo - libmadam_la_LIBADD = $(AM_LIBS) #libmadam_la_LDFLAGS = -Wl,--no-undefined From 09e50cc5851f6dae7f5efe94acf269ff9bd1591f Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Tue, 3 Sep 2019 12:52:35 -0700 Subject: [PATCH 15/17] make the aligned FFT default again and use FFTW_MEASURE as the default planning strategy --- src/commonparam.f90 | 2 +- src/fourier_fftw_2003.f90 | 17 +++++++---------- src/parameter_control.f90 | 11 +++++++---- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/commonparam.f90 b/src/commonparam.f90 index 991e1f4..8bdfd30 100644 --- a/src/commonparam.f90 +++ b/src/commonparam.f90 @@ -130,7 +130,7 @@ MODULE commonparam real(dp) :: dnshort=-1 integer :: nlong=-1, nshort=-1 logical :: kfirst=.true., kfilter=.false. - logical :: unaligned_fft=.true. + logical :: unaligned_fft=.false. real(dp) :: cglimit=1.d-12 integer :: iter_min=3, iter_max=1000 diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90 index 6f9e363..714bc9c 100644 --- a/src/fourier_fftw_2003.f90 +++ b/src/fourier_fftw_2003.f90 @@ -37,15 +37,13 @@ SUBROUTINE init_fourier(nof_in, unaligned) nof = nof_in nofinv = 1 / dble(nof) - !allocate(in(nof), out(nof/2+1), stat=ierr) - !if (ierr /= 0) then - ! stop 'Unable to allocate working space for Fourier transform' - !end if pin = fftw_alloc_real(int(nof, C_SIZE_T)) pout = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) call c_f_pointer(pin, in, [nof]) call c_f_pointer(pout, out, [nof/2 + 1]) + ! It may put a strain on the filesystem to load system wisdom + ! from every process. !!$ ierr = fftw_import_system_wisdom() !!$ !!$ if (info /= 0) then @@ -56,11 +54,11 @@ SUBROUTINE init_fourier(nof_in, unaligned) !!$ end if !!$ end if - !if (nof <= 65536) then - !fftw_planning_strategy = FFTW_MEASURE - !else - fftw_planning_strategy = FFTW_ESTIMATE - !end if + if (nof <= 1000000) then + fftw_planning_strategy = FFTW_MEASURE + else + fftw_planning_strategy = FFTW_ESTIMATE + end if fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT) !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_PRESERVE_INPUT) @@ -71,7 +69,6 @@ SUBROUTINE init_fourier(nof_in, unaligned) plan = fftw_plan_dft_r2c_1d(nof, in, out, fftw_planning_strategy) plan_inv = fftw_plan_dft_c2r_1d(nof, out, in, fftw_planning_strategy) - !deallocate(in, out) call fftw_free(pin) call fftw_free(pout) diff --git a/src/parameter_control.f90 b/src/parameter_control.f90 index 4633140..ffda8e7 100644 --- a/src/parameter_control.f90 +++ b/src/parameter_control.f90 @@ -590,10 +590,13 @@ SUBROUTINE write_parameters() write (*,fi) 'nside_cross', nside_cross, 'Healpix resolution (destriping)' if (info > 1) & write (*,fi) 'nside_submap', nside_submap, 'Submap resolution' - write (*,fk) 'concatenate_messages', concatenate_messages, & - 'use mpi_alltoallv to communicate' - write (*,fk) 'allreduce', allreduce, & - 'use allreduce to communicate' + if (allreduce) then + write (*,fk) 'allreduce', allreduce, & + 'use allreduce to communicate' + else + write (*,fk) 'concatenate_messages', concatenate_messages, & + 'use mpi_alltoallv to communicate' + end if write (*,fk) 'reassign_submaps', reassign_submaps, & 'minimize communication by reassigning submaps' From 9fdbbae7b1ffb386b4ddddd5e59b6c906cc843ed Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Tue, 3 Sep 2019 12:53:07 -0700 Subject: [PATCH 16/17] always use FFTW_MEASURE as the planning strategy --- src/fourier_fftw_2003.f90 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90 index 714bc9c..011e15f 100644 --- a/src/fourier_fftw_2003.f90 +++ b/src/fourier_fftw_2003.f90 @@ -54,11 +54,11 @@ SUBROUTINE init_fourier(nof_in, unaligned) !!$ end if !!$ end if - if (nof <= 1000000) then - fftw_planning_strategy = FFTW_MEASURE - else - fftw_planning_strategy = FFTW_ESTIMATE - end if + !if (nof <= 1000000) then + fftw_planning_strategy = FFTW_MEASURE + !else + ! fftw_planning_strategy = FFTW_ESTIMATE + !end if fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT) !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_PRESERVE_INPUT) From 26b6500d3714749a1bf47de06b1ae49cf57b8377 Mon Sep 17 00:00:00 2001 From: Reijo Keskitalo Date: Tue, 24 Sep 2019 14:48:30 -0700 Subject: [PATCH 17/17] Move parameter defaults into a separate function --- src/commonparam.f90 | 233 +++++++++++++++++++++++++++++++------------- src/smadam.F90 | 2 + 2 files changed, 168 insertions(+), 67 deletions(-) diff --git a/src/commonparam.f90 b/src/commonparam.f90 index 8bdfd30..e3b433b 100644 --- a/src/commonparam.f90 +++ b/src/commonparam.f90 @@ -34,41 +34,38 @@ MODULE commonparam real(dp), allocatable :: plateaus(:) END TYPE detector_data - real(dp) :: fsample = 1 ! Sampling frequency - character(len=30) :: unit_tod = 'unknown' + real(dp) :: fsample ! Sampling frequency + character(len=30) :: unit_tod ! OpenMP - integer :: nthreads_max=1, nthreads=1, id_thread=0 + integer :: nthreads_max, nthreads, id_thread integer, external :: omp_get_num_procs, omp_get_max_threads, & omp_get_thread_num, omp_get_num_threads - logical :: flag_by_horn=.false., force_pol=.false. - logical :: concatenate_messages = .true., allreduce = .false. - logical :: reassign_submaps = .true. - logical :: noise_weights_from_psd = .false. ! integrate noise weights internally + logical :: flag_by_horn, force_pol + logical :: concatenate_messages, allreduce, reassign_submaps + logical :: noise_weights_from_psd ! integrate noise weights internally ! Assume well-behaved noise spectrum without low pass filtering - logical :: radiometers = .true. - integer(i8b) :: psd_downsample=10 - integer (i8b) :: psdlen=1e6 + logical :: radiometers + integer(i8b) :: psd_downsample, psdlen ! Enable sub ring map making - integer(i2b) :: nsubchunk=0 - integer(i2b) :: isubchunk=0 - real(dp) :: fnoise_max=1000 ! When measuring noise variance, use this limit + integer(i2b) :: nsubchunk, isubchunk + real(dp) :: fnoise_max character(len=SLEN) :: file_profile = '' character(len=SLEN) :: file_intermediate_profile = '' - logical :: checknan=.false. ! Can cost time - real(dp) :: diagfilter=0 - logical :: sync_output=.true., skip_existing=.false. - logical :: write_cut=.false. - logical :: tod_is_clean = .false. - logical :: binary_output=.false., concatenate_binary=.false. + logical :: checknan + real(dp) :: diagfilter + logical :: sync_output, skip_existing + logical :: write_cut + logical :: tod_is_clean + logical :: binary_output, concatenate_binary ! Used for concatenate_binary when storing multiple MC maps - integer :: record_number=1 + integer :: record_number ! Number of independent groups of processes writing binary maps - integer(i4b) :: nwrite_binary = 10 + integer(i4b) :: nwrite_binary integer(i4b), parameter :: basis_poly=1, basis_fourier=2, basis_cheby=3, & basis_legendre=4 - integer(i4b) :: basis_func=basis_legendre, basis_order=0 + integer(i4b) :: basis_func, basis_order type :: basis_function_type integer(i8b) :: nsamp logical :: copy @@ -100,71 +97,68 @@ MODULE commonparam integer(i4b) :: nsurvey logical(byte), allocatable :: surveyflags(:) - logical :: bin_subsets = .false. - logical :: mcmode = .false., cached = .false. + logical :: bin_subsets + logical :: mcmode, cached - real(dp) :: good_baseline_fraction=0 ! default accepts all baselines + real(dp) :: good_baseline_fraction ! monte Carlo mode - integer(idp) :: mc_increment=1e7, mc_loops=1, mc_id=0, rng_base=0 - logical :: incomplete_matrices = .false. + integer(idp) :: mc_increment, mc_loops, mc_id, rng_base + logical :: incomplete_matrices - integer :: ID = 0 - integer :: ntasks = 1 + integer :: ID + integer :: ntasks character(len=*), parameter :: version = '3.7' character(len=*), parameter :: idf = '(i4,": ",a,1x,i0,2x,i0)' ! Input parameters - integer :: info=2 + integer :: info - integer :: nside_map=512, nside_cross=-1, nside_submap=16 + integer :: nside_map, nside_cross, nside_submap - real(dp) :: time_unit=-5.d0 - real(dp) :: mission_time=0.0 - integer :: nofiles=-1 + real(dp) :: time_unit + real(dp) :: mission_time + integer :: nofiles - integer :: pixmode_map=2, pixmode_cross=2 - real(dp) :: pixlim_map=1e-6, pixlim_cross=1e-3 - logical :: allow_decoupling = .false. + integer :: pixmode_map, pixmode_cross + real(dp) :: pixlim_map, pixlim_cross + logical :: allow_decoupling - real(dp) :: dnshort=-1 - integer :: nlong=-1, nshort=-1 - logical :: kfirst=.true., kfilter=.false. - logical :: unaligned_fft=.false. + real(dp) :: dnshort + integer :: nlong, nshort + logical :: kfirst, kfilter + logical :: unaligned_fft - real(dp) :: cglimit=1.d-12 - integer :: iter_min=3, iter_max=1000 - integer :: precond_width_min=10, precond_width_max=100 - logical :: use_fprecond=.false., use_cgprecond=.false. + real(dp) :: cglimit + integer :: iter_min, iter_max + integer :: precond_width_min, precond_width_max + logical :: use_fprecond, use_cgprecond - integer :: mode_detweight=0 + integer :: mode_detweight - logical :: rm_monopole=.false., temperature_only=.false. + logical :: rm_monopole, temperature_only ! Input files - character(len=SLEN) :: file_param='', & - file_inmask='', file_spectrum='', file_gap='' + character(len=SLEN) :: file_param, file_inmask, file_spectrum, file_gap ! Output files - character(len=SLEN) :: file_root='madam' - character(len=SLEN) :: file_map='', file_hit='', file_base='' - character(len=SLEN) :: file_matrix='', file_mask='', file_binmap='' - character(len=SLEN) :: file_wcov='', file_leakmatrix='' - character(len=SLEN) :: file_gap_out='', file_mc='', path_output='' + character(len=SLEN) :: file_root, file_map, file_hit, file_base, & + file_matrix, file_mask, file_binmap, file_wcov, file_leakmatrix, & + file_gap_out, file_mc, path_output - logical :: write_tod=.false. + logical :: write_tod ! LFI specific keywords - character(len=80) :: instrument = '' + character(len=80) :: instrument ! NCVM specific parameters - logical :: kwrite_covmat=.false. - character(len=SLEN) :: file_covmat = '' + logical :: kwrite_covmat + character(len=SLEN) :: file_covmat type(detector_data), allocatable, target :: detectors(:) ! Derived directly from input parameters - integer :: nmap=0, ncc=0, nside_max, nodetectors=-1 + integer :: nmap, ncc, nside_max, nodetectors ! Pixels integer :: nopix_map, nopix_cross @@ -186,7 +180,7 @@ MODULE commonparam real(i8b), allocatable :: baselines_short_time(:) ! Number of pointing periods and their duration as a number of samples - integer(i4b) :: ninterval = -1, ninterval_tot = -1 + integer(i4b) :: ninterval, ninterval_tot integer(i8b), allocatable :: intervals(:) integer(i4b), allocatable :: interval_id(:) @@ -196,14 +190,119 @@ MODULE commonparam integer(i4b), allocatable :: id_submap(:) integer(i4b) :: id_next, id_prev - logical :: do_map=.true., do_binmap=.false., do_hits=.false. - logical :: do_mask=.false., do_matrix=.false., do_wcov=.false. - logical :: do_base=.false., do_leakmatrix=.false. - logical :: do_wnmap=.false. - logical :: use_inmask - - integer :: noiter = 0 + logical :: do_map, do_binmap, do_hits, do_mask, do_matrix, do_wcov, do_base, & + do_leakmatrix, do_wnmap, use_inmask + integer :: noiter !-------------------------------------------------------------------------- +contains + + subroutine set_parameter_defaults() + ! Set or restore commonparam values to defaults + fsample = 1 + unit_tod = "unknown" + nthreads_max = 1 + nthreads = 1 + id_thread = 1 + flag_by_horn = .false. + force_pol = .false. + concatenate_messages = .true. + allreduce = .false. + reassign_submaps = .true. + noise_weights_from_psd = .false. + radiometers = .true. + psd_downsample = 10 + psdlen = 1e6 + nsubchunk = 0 + isubchunk = 0 + fnoise_max = 1000 ! When measuring noise variance, use this limit + checknan = .false. ! Can cost time + diagfilter = 0 + sync_output = .true. + skip_existing = .false. + write_cut = .false. + tod_is_clean = .false. + binary_output = .false. + concatenate_binary = .false. + record_number = 1 + nwrite_binary = 10 + basis_func = basis_legendre + basis_order = 0 + bin_subsets = .false. + mcmode = .false. + cached = .false. + good_baseline_fraction = 0 ! default accepts all baselines + mc_increment = 1e7 + mc_loops = 1 + mc_id = 0 + rng_base = 0 + incomplete_matrices = .false. + ID = 0 + ntasks = 1 + info = 2 + nside_map = 512 + nside_cross = -1 + nside_submap = 16 + time_unit = -5 + mission_time = 0 + nofiles = -1 + pixmode_map = 2 + pixmode_cross = 2 + pixlim_map = 1e-6 + pixlim_cross = 1e-3 + allow_decoupling = .false. + dnshort = -1 + nlong = -1 + nshort = -1 + kfirst = .true. + kfilter = .false. + unaligned_fft = .false. + cglimit = 1e-12 + iter_min = 3 + iter_max = 1000 + precond_width_min = 10 + precond_width_max = 100 + use_fprecond = .false. + use_cgprecond = .false. + mode_detweight = 0 + rm_monopole = .false. + temperature_only = .false. + file_param = "" + file_inmask = "" + file_spectrum = "" + file_gap = "" + file_root = "madam" + file_map = "" + file_hit = "" + file_base = "" + file_matrix = "" + file_mask = "" + file_binmap = "" + file_wcov = "" + file_leakmatrix = "" + file_gap_out = "" + file_mc = "" + path_output = "" + write_tod = .false. + instrument = "" + kwrite_covmat = .false. + file_covmat = "" + nmap = 0 + ncc = 0 + nodetectors = -1 + ninterval = -1 + ninterval_tot = -1 + do_map = .true. + do_binmap = .false. + do_hits = .false. + do_mask = .false. + do_matrix = .false. + do_wcov = .false. + do_base = .false. + do_leakmatrix = .false. + do_wnmap = .false. + noiter = 0 + end subroutine set_parameter_defaults + END MODULE commonparam diff --git a/src/smadam.F90 b/src/smadam.F90 index 201f980..38af411 100644 --- a/src/smadam.F90 +++ b/src/smadam.F90 @@ -104,6 +104,8 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & integer(i2b) :: subchunkcounter integer(i8b) :: pixmin, pixmax + call set_parameter_defaults() + ! set up MPI call init_mpi(comm, ntasks, id)