From 09600886fbc120cd81d1dd289d9db8af56d0927e Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Mon, 19 Aug 2019 11:14:00 -0700
Subject: [PATCH 01/17] add extensive profiling to the CG iteration

---
 src/noise_routines.f90  | 90 +++++++++++++++++++++++++++++++----------
 src/smadam_routines.f90 | 31 ++++++++++++--
 2 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 8e81426..3e4bacc 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -1137,7 +1137,7 @@ END SUBROUTINE construct_preconditioner
   !------------------------------------------------------------------------------
 
 
-  SUBROUTINE preconditioning_band(z, r, nna)
+  SUBROUTINE preconditioning_band(z, r, nna, istep)
     !Apply the preconditioner
 
     ! z and r may be 3-dimensional arrays in the calling code but
@@ -1147,14 +1147,19 @@ SUBROUTINE preconditioning_band(z, r, nna)
     real(dp), intent(in), target :: r(0:basis_order, noba_short, nodetectors)
     real(dp), intent(in)  :: &
          nna(0:basis_order, 0:basis_order, noba_short, nodetectors)
+    integer, intent(in) :: istep
 
     integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, &
-         num_threads, m, nband, ipsd, isub
+         num_threads, m, nband, ipsd, isub, niter
     real(dp) :: t1, t2, tf
 
     real(C_DOUBLE), pointer :: xx(:) => NULL()
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
     type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1)
+    ! DEBUG begin
+    real(dp) :: time_cholesky(100), time_cgprecond(100), time_cgprecond_filter(100)
+    integer :: ncall_cholesky(100), ncall_cgprecond(100), niter_cgprecond(100)
+    ! DEBUG end
 
     z = r
 
@@ -1177,14 +1182,25 @@ SUBROUTINE preconditioning_band(z, r, nna)
           pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
        end do
 
+       ! DEBUG begin
+       time_cholesky = 0
+       time_cgprecond = 0
+       time_cgprecond_filter = 0
+       ncall_cholesky = 0
+       ncall_cgprecond = 0
+       niter_cgprecond = 0
+       ! DEBUG end
+
        !$OMP PARALLEL NUM_THREADS(nthreads) &
        !$OMP     DEFAULT(NONE) &
        !$OMP     SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, &
        !$OMP         id, nof, nshort, detectors, nthreads, fprecond, nna, &
-       !$OMP         baselines_short_time, invcov, fcov, checknan, pxx, pfx) &
+       !$OMP         baselines_short_time, invcov, fcov, checknan, pxx, pfx, &
+       !$OMP         time_cholesky, ncall_cholesky, time_cgprecond, &
+       !$OMP         ncall_cgprecond, niter_cgprecond, time_cgprecond_filter) &
        !$OMP     PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, &
        !$OMP         xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, &
-       !$OMP         t1, t2, tf)
+       !$OMP         t1, t2, tf, niter)
 
        !t1 = get_time(55)
        !tf = 0
@@ -1211,16 +1227,30 @@ SUBROUTINE preconditioning_band(z, r, nna)
                    ipsd = psd_index(idet, baselines_short_time(kstart+1))
                    if (allocated(bandprec(isub, ichunk, idet)%data)) then
                       ! Use the precomputed Cholesky decomposition
+                      t1 = MPI_Wtime()  ! DEBUG
                       nband = size(bandprec(isub, ichunk, idet)%data, 1)
                       call apply_cholesky_decomposition( &
                            noba, z(0, kstart+1, idet), r(0, kstart+1, idet), &
                            nband, bandprec(isub, ichunk, idet)%data)
+                      ! DEBUG begin
+                      t2 = MPI_Wtime()
+                      time_cholesky(id_thread) = time_cholesky(id_thread) + (t2 - t1)
+                      ncall_cholesky(id_thread) = ncall_cholesky(id_thread) + 1
+                      ! DEBUG end
                    else
                       ! Use CG iteration to apply the preconditioner
+                      t1 = MPI_Wtime()  ! DEBUG
                       call apply_CG_preconditioner( &
                            noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), &
                            z(0, kstart+1, idet), r(0, kstart+1, idet), &
-                           invcov(1, ipsd), fx, xx, tf)
+                           invcov(1, ipsd), fx, xx, tf, niter)
+                      ! DEBUG begin
+                      t2 = MPI_Wtime()
+                      time_cgprecond(id_thread) = time_cgprecond(id_thread) + (t2 - t1)
+                      time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf
+                      niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter
+                      ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1
+                      ! DEBUG end
                    end if
                 end if
                 kstart = kstart + noba
@@ -1237,6 +1267,19 @@ SUBROUTINE preconditioning_band(z, r, nna)
        end do
     end if
 
+    ! DEBUG begin
+    do id_thread = 0, nthreads - 1
+       write(1000 + id, *) "step", istep, "thread", id_thread, &
+            "time_cholesky", time_cholesky(id_thread), &
+            "ncall_cholesky", ncall_cholesky(id_thread)
+       write(1000 + id, *) "step", istep, "thread", id_thread, &
+            "time_cgprecond", time_cgprecond(id_thread), &
+            "ncall_cgprecond", ncall_cgprecond(id_thread), &
+            "niter_cgprecond", niter_cgprecond(id_thread), &
+            "time_cgprecond_filter", time_cgprecond_filter(id_thread)
+    end do
+    ! DEBUG end
+
     cputime_precond = cputime_precond + get_time(16)
 
   END SUBROUTINE preconditioning_band
@@ -1289,7 +1332,7 @@ end subroutine get_cholesky_decomposition
 
 
   subroutine apply_CG_preconditioner( &
-       noba, nof, fcov, nna, x, b, invcov, fx, xx, tf)
+       noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, niter)
     !
     ! Apply the preconditioner using CG iteration
     !
@@ -1298,9 +1341,10 @@ subroutine apply_CG_preconditioner( &
     real(dp), intent(in) :: nna(noba), invcov(noba)
     real(dp), intent(out) :: x(noba)
     real(dp), intent(in) :: b(noba)
-    real(dp), intent(inout) :: tf
     real(C_DOUBLE), intent(inout) :: xx(nof)
     complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1)
+    real(dp), intent(inout) :: tf
+    integer, intent(inout) :: niter
 
     real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:)
     real(dp) :: alpha, beta, rr, rr0, rr_old, rz, rz_old, Anorm
@@ -1310,6 +1354,7 @@ subroutine apply_CG_preconditioner( &
     ! Polak-Ribiere formula to allow for a changing preconditioner
     real(dp), parameter :: cglimit = 1e-6
     integer, parameter :: itermax = 100
+    real(dp) :: tfilter
 
     allocate(resid(noba), prop(noba), Aprop(noba), zresid(noba), &
          precond(noba), stat=ierr)
@@ -1328,8 +1373,9 @@ subroutine apply_CG_preconditioner( &
 
     prop = zresid
     iter = 1
+    tfilter = 0
     do
-       call apply_A(prop, Aprop, tf)
+       call apply_A(prop, Aprop)
        Anorm = dot_product(prop, Aprop)
        alpha = rz / Anorm
        x = x + alpha * prop
@@ -1348,6 +1394,9 @@ subroutine apply_CG_preconditioner( &
 
     deallocate(resid, prop, Aprop, zresid, precond)
 
+    tf = tfilter
+    niter = iter
+
   contains
 
     subroutine apply_precond(x, z)
@@ -1360,34 +1409,33 @@ subroutine apply_precond(x, z)
       z = precond * x
     end subroutine apply_precond
 
-    subroutine apply_A(x, y, tf)
+    subroutine apply_A(x, y)
       !
       ! Apply the square matrix `A` to `x` and place the result in `y`
       !
       real(dp), intent(in) :: x(noba)
       real(dp), intent(out) :: y(noba)
-      real(dp), intent(inout) :: tf
-      call convolve(x, fcov, y, tf)
-      y = y + nna*x
+      call convolve(x, fcov, y)
+      y = y + nna * x
     end subroutine apply_A
 
-    subroutine convolve(x, fc, y, tf)
+    subroutine convolve(x, fc, y)
       !
       ! Convolve `x` with filter `fc` and place result in `y`.
       !
       real(dp), intent(in) :: x(noba)
-      complex(dp) :: fc(nof/2 + 1)
+      complex(dp) :: fc(nof / 2 + 1)
       real(dp), intent(out) :: y(noba)
-      real(dp), intent(inout) :: tf
-      real(dp) :: t1, t2
+      real(dp) :: t
       xx(:noba) = x - sum(x)/noba
-      xx(noba+1:) = 0
-      !t1 = get_time(56)
+      xx(noba + 1:) = 0
+      t = MPI_Wtime()
       call dfft(fx, xx)
-      fx = fx*fc
+      tfilter = tfilter + (MPI_Wtime() - t)
+      fx = fx * fc
+      t = MPI_Wtime()
       call dfftinv(xx, fx)
-      !t2 = get_time(56)
-      !tf = tf + t2 - t1
+      tfilter = tfilter + (MPI_Wtime() - t)
       y = xx(:noba)
       y = y - sum(y) / noba
     end subroutine convolve
diff --git a/src/smadam_routines.f90 b/src/smadam_routines.f90
index 534ae62..9defdaa 100644
--- a/src/smadam_routines.f90
+++ b/src/smadam_routines.f90
@@ -701,6 +701,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
     real(dp), allocatable, target :: ap_all_threads(:, :, :, :)
     real(dp), pointer :: ap_thread(:, :, :)
     real(dp), allocatable :: resid(:)
+    real(dp) :: t0, t1, t2, t3
 
     if (info > 4) write(*,idf) ID,'Begin iteration...'
 
@@ -730,7 +731,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
 
     r = yba
 
-    call apply_preconditioner(z, r)
+    call apply_preconditioner(z, r, -1)
 
     p = z
     rz = sum(r * z, mask=rmask)
@@ -786,6 +787,8 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
        if (istep >= iter_max) exit
        istep = istep + 1
 
+       t1 = MPI_Wtime()  ! DEBUG
+
        call reset_time(12)
 
        ! 1) evaluate A.p
@@ -810,23 +813,33 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
           end if
        end if
 
+       t2 = MPI_Wtime()  ! DEBUG
+       write(1000 + id, *) "step", istep, "baseline_to_map", t2 - t1  ! DEBUG
+
        ! Communicate maps between processes
        call wait_mpi
        cputime_cga_1 = cputime_cga_1 + get_time_and_reset(12)
 
        wamap = 0
+       t1 = MPI_Wtime()  ! DEBUG
        call collect_map(wamap, nosubpix_cross, .true.) ! locmap -> wamap
+       write(1000 + id, *) "step", istep, "collect_map", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_mpi_reduce = cputime_cga_mpi_reduce + get_time_and_reset(12)
        ! apply cca, rejects masked pixels
+       t1 = MPI_Wtime()  ! DEBUG
        call ccmultiply(cca, wamap, nopix_cross)
+       write(1000 + id, *) "step", istep, "ccmultiply", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_cc = cputime_cga_cc + get_time_and_reset(12)
+       t1 = MPI_Wtime()  ! DEBUG
        call scatter_map(wamap, nosubpix_cross) ! wamap -> locmap
+       write(1000 + id, *) "step", istep, "scatter_map", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_mpi_scatter = cputime_cga_mpi_scatter &
             + get_time_and_reset(12)
 
+       t1 = MPI_Wtime()  ! DEBUG
        if (kfilter) then
           call cinvmul(ap, p, nna)
        else
@@ -843,11 +856,13 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
              end do
           end if
        end if
+       write(1000 + id, *) "step", istep, "cinvmul", MPI_Wtime() - t1  ! DEBUG
 
        ! From map to baseline
 
        call reset_time(12)
 
+       t1 = MPI_Wtime()  ! DEBUG
        ap_all_threads = 0
 
        if (basis_order == 0) then
@@ -868,10 +883,12 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
 
        ap = ap + sum(ap_all_threads, dim=4)
 
+       write(1000 + id, *) "step", istep, "map_to_baseline", MPI_Wtime() - t1  ! DEBUG
        cputime_cga_2 = cputime_cga_2 + get_time(12)
 
        ! 2) Evaluate p^T.A.p
 
+       t1 = MPI_Wtime()  ! DEBUG
        pap = sum(p*ap, mask=rmask)
        call sum_mpi(pap)
 
@@ -886,12 +903,16 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
             aa(:, 1:noba_short, 1:nodetectors) + alpha*p
        r = r - alpha*ap
 
+       write(1000 + id, *) "step", istep, "update_guess", MPI_Wtime() - t1  ! DEBUG
        ! 5) Precondition
 
-       call apply_preconditioner(z, r)
+       t1 = MPI_Wtime()  ! DEBUG
+       call apply_preconditioner(z, r, istep)
+       write(1000 + id, *) "step", istep, "precondition", MPI_Wtime() - t1  ! DEBUG
 
        ! 6) Check for convergence
 
+       t1 = MPI_Wtime()  ! DEBUG
        rzo = rz
        rz = sum(r * z, mask=rmask)
        rz2 = sum(ro * z, mask=rmask)
@@ -905,6 +926,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
        ! This is the Polak-Ribiere formula that
        ! allows for updates to the preconditioner
        beta = (rz - rz2) / rzo
+       write(1000 + id, *) "step", istep, "check_convergence", MPI_Wtime() - t1  ! DEBUG
 
        if (ID==0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') &
             istep, rr/rrinit, rz2/rz, alpha, beta, get_time_and_reset(99)
@@ -936,12 +958,13 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
 
   contains
 
-    subroutine apply_preconditioner(z, r)
+    subroutine apply_preconditioner(z, r, istep)
       real(dp), intent(in) :: r(0:basis_order, noba_short, nodetectors)
       real(dp), intent(out) :: z(0:basis_order, noba_short, nodetectors)
+      integer, intent(in) :: istep
       integer :: i, idet
       if (basis_order == 0) then
-         call preconditioning_band(z, r, nna)
+         call preconditioning_band(z, r, nna, istep)
       else
          do idet = 1, nodetectors
             do i = 1, noba_short

From 8f290c238cc02b90771848c3368510b1b100dbe1 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Mon, 19 Aug 2019 12:39:18 -0700
Subject: [PATCH 02/17] Fix index error

---
 src/noise_routines.f90 | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 3e4bacc..7d3eae4 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -1157,8 +1157,8 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
     type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1)
     ! DEBUG begin
-    real(dp) :: time_cholesky(100), time_cgprecond(100), time_cgprecond_filter(100)
-    integer :: ncall_cholesky(100), ncall_cgprecond(100), niter_cgprecond(100)
+    real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), time_cgprecond_filter(:)
+    integer, allocatable :: ncall_cholesky(:), ncall_cgprecond(:), niter_cgprecond(:)
     ! DEBUG end
 
     z = r
@@ -1169,6 +1169,18 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
     call reset_time(16)
 
+    ! DEBUG begin
+    allocate(time_cholesky(0:nthreads-1), time_cgprecond(0:nthreads-1), &
+         time_cgprecond_filter(0:nthreads-1), ncall_cholesky(0:nthreads-1), &
+         ncall_cgprecond(0:nthreads-1), niter_cgprecond(0:nthreads-1))
+    time_cholesky = 0
+    time_cgprecond = 0
+    time_cgprecond_filter = 0
+    ncall_cholesky = 0
+    ncall_cgprecond = 0
+    niter_cgprecond = 0
+    ! DEBUG end
+
     if (use_diagonal) then
        z(0, :, :) = r(0, :, :) * prec_diag
     else if (use_fprecond) then
@@ -1182,15 +1194,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
           pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
        end do
 
-       ! DEBUG begin
-       time_cholesky = 0
-       time_cgprecond = 0
-       time_cgprecond_filter = 0
-       ncall_cholesky = 0
-       ncall_cgprecond = 0
-       niter_cgprecond = 0
-       ! DEBUG end
-
        !$OMP PARALLEL NUM_THREADS(nthreads) &
        !$OMP     DEFAULT(NONE) &
        !$OMP     SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, &
@@ -1278,6 +1281,8 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
             "niter_cgprecond", niter_cgprecond(id_thread), &
             "time_cgprecond_filter", time_cgprecond_filter(id_thread)
     end do
+    deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, &
+         ncall_cholesky, ncall_cgprecond, niter_cgprecond)
     ! DEBUG end
 
     cputime_precond = cputime_precond + get_time(16)

From f992fba18eb5bd780f3e9c9d1feb1e25bdae5973 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Mon, 19 Aug 2019 13:34:46 -0700
Subject: [PATCH 03/17] clean output and add barriers

---
 src/noise_routines.f90  | 32 ++++++++++++++++++--------------
 src/smadam_routines.f90 | 39 +++++++++++++++++++++------------------
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 7d3eae4..08fd834 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -1151,7 +1151,7 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
     integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, &
          num_threads, m, nband, ipsd, isub, niter
-    real(dp) :: t1, t2, tf
+    real(dp) :: t1, t2, tf, t_fftw_memory
 
     real(C_DOUBLE), pointer :: xx(:) => NULL()
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
@@ -1189,10 +1189,12 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
     else
        ! Allocate FFTW work space outside the threaded region to avoid
        ! a GCC compiler segfault when freeing the workspace
+       t1 = MPI_Wtime()  ! DEBUG
        do id_thread = 0, nthreads - 1
           pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
           pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
        end do
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_alloc", MPI_Wtime() - t1  ! DEBUG
 
        !$OMP PARALLEL NUM_THREADS(nthreads) &
        !$OMP     DEFAULT(NONE) &
@@ -1236,8 +1238,7 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
                            noba, z(0, kstart+1, idet), r(0, kstart+1, idet), &
                            nband, bandprec(isub, ichunk, idet)%data)
                       ! DEBUG begin
-                      t2 = MPI_Wtime()
-                      time_cholesky(id_thread) = time_cholesky(id_thread) + (t2 - t1)
+                      time_cholesky(id_thread) = time_cholesky(id_thread) + (MPI_Wtime() - t1)
                       ncall_cholesky(id_thread) = ncall_cholesky(id_thread) + 1
                       ! DEBUG end
                    else
@@ -1248,8 +1249,7 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
                            z(0, kstart+1, idet), r(0, kstart+1, idet), &
                            invcov(1, ipsd), fx, xx, tf, niter)
                       ! DEBUG begin
-                      t2 = MPI_Wtime()
-                      time_cgprecond(id_thread) = time_cgprecond(id_thread) + (t2 - t1)
+                      time_cgprecond(id_thread) = time_cgprecond(id_thread) + (MPI_Wtime() - t1)
                       time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf
                       niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter
                       ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1
@@ -1264,22 +1264,26 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
        !$OMP END PARALLEL
 
+       t1 = MPI_Wtime()  ! DEBUG
        do id_thread = 0, nthreads - 1
           call fftw_free(pxx(id_thread))
           call fftw_free(pfx(id_thread))
        end do
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_free", MPI_Wtime() - t1  ! DEBUG
     end if
 
     ! DEBUG begin
     do id_thread = 0, nthreads - 1
-       write(1000 + id, *) "step", istep, "thread", id_thread, &
-            "time_cholesky", time_cholesky(id_thread), &
-            "ncall_cholesky", ncall_cholesky(id_thread)
-       write(1000 + id, *) "step", istep, "thread", id_thread, &
-            "time_cgprecond", time_cgprecond(id_thread), &
-            "ncall_cgprecond", ncall_cgprecond(id_thread), &
-            "niter_cgprecond", niter_cgprecond(id_thread), &
-            "time_cgprecond_filter", time_cgprecond_filter(id_thread)
+       write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i8)') &
+            "step", istep, " thread", id_thread, &
+            " time_cholesky", time_cholesky(id_thread), &
+            " ncall_cholesky", ncall_cholesky(id_thread)
+       write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2)') &
+            "step", istep, " thread", id_thread, &
+            " time_cgprecond", time_cgprecond(id_thread), &
+            " ncall_cgprecond", ncall_cgprecond(id_thread), &
+            " niter_cgprecond", niter_cgprecond(id_thread), &
+            " time_cgprecond_filter", time_cgprecond_filter(id_thread)
     end do
     deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, &
          ncall_cholesky, ncall_cgprecond, niter_cgprecond)
@@ -1432,7 +1436,7 @@ subroutine convolve(x, fc, y)
       complex(dp) :: fc(nof / 2 + 1)
       real(dp), intent(out) :: y(noba)
       real(dp) :: t
-      xx(:noba) = x - sum(x)/noba
+      xx(:noba) = x - sum(x) / noba
       xx(noba + 1:) = 0
       t = MPI_Wtime()
       call dfft(fx, xx)
diff --git a/src/smadam_routines.f90 b/src/smadam_routines.f90
index 9defdaa..63a360b 100644
--- a/src/smadam_routines.f90
+++ b/src/smadam_routines.f90
@@ -814,7 +814,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
        end if
 
        t2 = MPI_Wtime()  ! DEBUG
-       write(1000 + id, *) "step", istep, "baseline_to_map", t2 - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "baseline_to_map", t2 - t1  ! DEBUG
 
        ! Communicate maps between processes
        call wait_mpi
@@ -823,18 +823,18 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
        wamap = 0
        t1 = MPI_Wtime()  ! DEBUG
        call collect_map(wamap, nosubpix_cross, .true.) ! locmap -> wamap
-       write(1000 + id, *) "step", istep, "collect_map", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "collect_map", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_mpi_reduce = cputime_cga_mpi_reduce + get_time_and_reset(12)
        ! apply cca, rejects masked pixels
        t1 = MPI_Wtime()  ! DEBUG
        call ccmultiply(cca, wamap, nopix_cross)
-       write(1000 + id, *) "step", istep, "ccmultiply", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "ccmultiply", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_cc = cputime_cga_cc + get_time_and_reset(12)
        t1 = MPI_Wtime()  ! DEBUG
        call scatter_map(wamap, nosubpix_cross) ! wamap -> locmap
-       write(1000 + id, *) "step", istep, "scatter_map", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "scatter_map", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_mpi_scatter = cputime_cga_mpi_scatter &
             + get_time_and_reset(12)
@@ -856,7 +856,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
              end do
           end if
        end if
-       write(1000 + id, *) "step", istep, "cinvmul", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "cinvmul", MPI_Wtime() - t1  ! DEBUG
 
        ! From map to baseline
 
@@ -883,35 +883,38 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
 
        ap = ap + sum(ap_all_threads, dim=4)
 
-       write(1000 + id, *) "step", istep, "map_to_baseline", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "map_to_baseline", MPI_Wtime() - t1  ! DEBUG
        cputime_cga_2 = cputime_cga_2 + get_time(12)
 
        ! 2) Evaluate p^T.A.p
 
+       call wait_mpi  ! DEBUB
        t1 = MPI_Wtime()  ! DEBUG
-       pap = sum(p*ap, mask=rmask)
+       pap = sum(p * ap, mask=rmask)
        call sum_mpi(pap)
 
        ! 3) alpha = r.z / (p^T.A.p)
 
-       alpha = rz/pap
+       alpha = rz / pap
        ro = r ! Keep a copy for Polak-Ribiere beta
 
        ! 4) update `aa` and `r`
 
        aa(:, 1:noba_short, 1:nodetectors) = &
-            aa(:, 1:noba_short, 1:nodetectors) + alpha*p
-       r = r - alpha*ap
+            aa(:, 1:noba_short, 1:nodetectors) + alpha * p
+       r = r - alpha * ap
 
-       write(1000 + id, *) "step", istep, "update_guess", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "update_guess", MPI_Wtime() - t1  ! DEBUG
        ! 5) Precondition
 
+       call wait_mpi  ! DEBUB
        t1 = MPI_Wtime()  ! DEBUG
        call apply_preconditioner(z, r, istep)
-       write(1000 + id, *) "step", istep, "precondition", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "precondition", MPI_Wtime() - t1  ! DEBUG
 
        ! 6) Check for convergence
 
+       call wait_mpi  ! DEBUB
        t1 = MPI_Wtime()  ! DEBUG
        rzo = rz
        rz = sum(r * z, mask=rmask)
@@ -922,22 +925,22 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
        call sum_mpi(rr)
        ! This is the Fletcher-Reeves formula that
        ! assumes stationary preconditioning
-       !beta = rz / rzo
+       ! beta = rz / rzo
        ! This is the Polak-Ribiere formula that
        ! allows for updates to the preconditioner
        beta = (rz - rz2) / rzo
-       write(1000 + id, *) "step", istep, "check_convergence", MPI_Wtime() - t1  ! DEBUG
+       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "check_convergence", MPI_Wtime() - t1  ! DEBUG
 
        if (ID==0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') &
-            istep, rr/rrinit, rz2/rz, alpha, beta, get_time_and_reset(99)
+            istep, rr / rrinit, rz2 / rz, alpha, beta, get_time_and_reset(99)
 
-       if (rr/rrinit > 1e3) call abort_mpi('CG is diverging')
-       if (rr/rrinit < cglimit .and. istep > iter_min) exit
+       if (rr / rrinit > 1e3) call abort_mpi('CG is diverging')
+       if (rr / rrinit < cglimit .and. istep > iter_min) exit
        if (rz == 0) exit
 
        ! 7) Update search direction, `p`
 
-       p = z + beta*p
+       p = z + beta * p
 
     end do
 

From 8c71e1a7d5b685f82405cda28f4fa34c72e7dc7c Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Mon, 19 Aug 2019 14:04:28 -0700
Subject: [PATCH 04/17] time convolution

---
 src/noise_routines.f90 | 45 +++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 08fd834..53fdc44 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -1151,13 +1151,14 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
     integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, &
          num_threads, m, nband, ipsd, isub, niter
-    real(dp) :: t1, t2, tf, t_fftw_memory
+    real(dp) :: t1, t2, tf, t_fftw_memory, tc
 
     real(C_DOUBLE), pointer :: xx(:) => NULL()
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
     type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1)
     ! DEBUG begin
-    real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), time_cgprecond_filter(:)
+    real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), &
+         time_cgprecond_filter(:), time_cgprecond_convolve(:)
     integer, allocatable :: ncall_cholesky(:), ncall_cgprecond(:), niter_cgprecond(:)
     ! DEBUG end
 
@@ -1171,11 +1172,13 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
     ! DEBUG begin
     allocate(time_cholesky(0:nthreads-1), time_cgprecond(0:nthreads-1), &
-         time_cgprecond_filter(0:nthreads-1), ncall_cholesky(0:nthreads-1), &
+         time_cgprecond_filter(0:nthreads-1), time_cgprecond_convolve(0:nthreads-1), &
+         ncall_cholesky(0:nthreads-1), &
          ncall_cgprecond(0:nthreads-1), niter_cgprecond(0:nthreads-1))
     time_cholesky = 0
     time_cgprecond = 0
     time_cgprecond_filter = 0
+    time_cgprecond_convolve = 0
     ncall_cholesky = 0
     ncall_cgprecond = 0
     niter_cgprecond = 0
@@ -1202,10 +1205,10 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
        !$OMP         id, nof, nshort, detectors, nthreads, fprecond, nna, &
        !$OMP         baselines_short_time, invcov, fcov, checknan, pxx, pfx, &
        !$OMP         time_cholesky, ncall_cholesky, time_cgprecond, &
-       !$OMP         ncall_cgprecond, niter_cgprecond, time_cgprecond_filter) &
+       !$OMP         ncall_cgprecond, niter_cgprecond, time_cgprecond_filter, time_cgprecond_convolve) &
        !$OMP     PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, &
        !$OMP         xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, &
-       !$OMP         t1, t2, tf, niter)
+       !$OMP         t1, t2, tf, tc, niter)
 
        !t1 = get_time(55)
        !tf = 0
@@ -1247,10 +1250,11 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
                       call apply_CG_preconditioner( &
                            noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), &
                            z(0, kstart+1, idet), r(0, kstart+1, idet), &
-                           invcov(1, ipsd), fx, xx, tf, niter)
+                           invcov(1, ipsd), fx, xx, tf, tc, niter)
                       ! DEBUG begin
                       time_cgprecond(id_thread) = time_cgprecond(id_thread) + (MPI_Wtime() - t1)
                       time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf
+                      time_cgprecond_convolve(id_thread) = time_cgprecond_convolve(id_thread) + tc
                       niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter
                       ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1
                       ! DEBUG end
@@ -1278,12 +1282,13 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
             "step", istep, " thread", id_thread, &
             " time_cholesky", time_cholesky(id_thread), &
             " ncall_cholesky", ncall_cholesky(id_thread)
-       write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2)') &
+       write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2,a,x,f8.2)') &
             "step", istep, " thread", id_thread, &
             " time_cgprecond", time_cgprecond(id_thread), &
             " ncall_cgprecond", ncall_cgprecond(id_thread), &
             " niter_cgprecond", niter_cgprecond(id_thread), &
-            " time_cgprecond_filter", time_cgprecond_filter(id_thread)
+            " time_cgprecond_filter", time_cgprecond_filter(id_thread), &
+            " time_cgprecond_convolve", time_cgprecond_convolve(id_thread)
     end do
     deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, &
          ncall_cholesky, ncall_cgprecond, niter_cgprecond)
@@ -1341,7 +1346,7 @@ end subroutine get_cholesky_decomposition
 
 
   subroutine apply_CG_preconditioner( &
-       noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, niter)
+       noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, tc, niter)
     !
     ! Apply the preconditioner using CG iteration
     !
@@ -1352,7 +1357,7 @@ subroutine apply_CG_preconditioner( &
     real(dp), intent(in) :: b(noba)
     real(C_DOUBLE), intent(inout) :: xx(nof)
     complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1)
-    real(dp), intent(inout) :: tf
+    real(dp), intent(inout) :: tf, tc
     integer, intent(inout) :: niter
 
     real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:)
@@ -1363,7 +1368,7 @@ subroutine apply_CG_preconditioner( &
     ! Polak-Ribiere formula to allow for a changing preconditioner
     real(dp), parameter :: cglimit = 1e-6
     integer, parameter :: itermax = 100
-    real(dp) :: tfilter
+    real(dp) :: tfilter, tconv  ! DEBUG
 
     allocate(resid(noba), prop(noba), Aprop(noba), zresid(noba), &
          precond(noba), stat=ierr)
@@ -1382,7 +1387,10 @@ subroutine apply_CG_preconditioner( &
 
     prop = zresid
     iter = 1
+    ! DEBUG begin
     tfilter = 0
+    tconv = 0
+    ! DEBUG end
     do
        call apply_A(prop, Aprop)
        Anorm = dot_product(prop, Aprop)
@@ -1403,8 +1411,11 @@ subroutine apply_CG_preconditioner( &
 
     deallocate(resid, prop, Aprop, zresid, precond)
 
+    ! DEBUG begin
     tf = tfilter
+    tc = tconv
     niter = iter
+    ! DEBUG end
 
   contains
 
@@ -1435,18 +1446,20 @@ subroutine convolve(x, fc, y)
       real(dp), intent(in) :: x(noba)
       complex(dp) :: fc(nof / 2 + 1)
       real(dp), intent(out) :: y(noba)
-      real(dp) :: t
+      real(dp) :: t1, t2
+      t1 = MPI_Wtime()  ! DEBUG
       xx(:noba) = x - sum(x) / noba
       xx(noba + 1:) = 0
-      t = MPI_Wtime()
+      t2 = MPI_Wtime()  ! DEBUG
       call dfft(fx, xx)
-      tfilter = tfilter + (MPI_Wtime() - t)
+      tfilter = tfilter + (MPI_Wtime() - t2)  ! DEBUG
       fx = fx * fc
-      t = MPI_Wtime()
+      t2 = MPI_Wtime()  ! DEBUG
       call dfftinv(xx, fx)
-      tfilter = tfilter + (MPI_Wtime() - t)
+      tfilter = tfilter + (MPI_Wtime() - t2)  ! DEBUG
       y = xx(:noba)
       y = y - sum(y) / noba
+      tconv = tconv + (MPI_Wtime() - t1)  ! DEBUG
     end subroutine convolve
 
   end subroutine apply_CG_preconditioner

From 920676d54c4d3351c2281fe600bef8c32dfa4e21 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Mon, 19 Aug 2019 15:09:20 -0700
Subject: [PATCH 05/17] allocate thread work space only once

---
 src/noise_routines.f90 | 73 +++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 44 deletions(-)

diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 53fdc44..18d1196 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -41,6 +41,9 @@ MODULE noise_routines
   type(bandprec_pointer), allocatable :: bandprec(:, :, :)
   real(dp), allocatable, target :: prec_diag(:, :)
 
+  ! Pointers for FFT workspace
+  type(C_PTR), allocatable :: pxx(:), pfx(:)
+
   ! PSD information
 
   integer :: npsdtot
@@ -235,6 +238,7 @@ end function psd_index_det
   SUBROUTINE init_filter
 
     integer :: nof_min, ierr
+    real(dp) :: t1
 
     if (.not. kfilter) return
 
@@ -247,17 +251,30 @@ SUBROUTINE init_filter
     nof = 1
     do
        if (nof >= nof_min) exit
-       nof = 2*nof
+       nof = 2 * nof
     end do
     if (ID == 0) write(*,*) 'FFT length =', nof
 
     !allocate(fx(nof/2+1), xx(nof), stat=ierr) ! Work space
     !if (ierr /= 0) call abort_mpi('No room for fx and xx')
 
-    memory_filter = ((nof/2+1)*16. + nof*8) * nthreads
+    memory_filter = ((nof / 2 + 1) * 16. + nof * 8) * nthreads
 
     call init_fourier(nof)
 
+    ! Permanently allocate space for FFT for each OpenMP thread
+
+    t1 = MPI_Wtime()  ! DEBUG
+    allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr)
+    if (ierr /= 0) stop 'Failed to allocate pxx and pfx'
+    pxx = C_NULL_PTR
+    pfx = C_NULL_PTR
+    do id_thread = 0, nthreads - 1
+       pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
+       pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
+    end do
+    write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", -1, "fftw_alloc", MPI_Wtime() - t1  ! DEBUG
+
   END SUBROUTINE init_filter
 
 
@@ -295,6 +312,8 @@ end subroutine free_bandprec
 
   SUBROUTINE close_filter
 
+    real(dp) :: t1
+
     if (.not. kfilter) return
 
     if (allocated(fcov)) deallocate(fcov)
@@ -311,6 +330,14 @@ SUBROUTINE close_filter
 
     call close_fourier
 
+    t1 = MPI_Wtime()  ! DEBUG
+    do id_thread = 0, nthreads - 1
+       call fftw_free(pxx(id_thread))
+       call fftw_free(pfx(id_thread))
+    end do
+    deallocate(pxx, pfx)
+    write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", 999, "fftw_free", MPI_Wtime() - t1  ! DEBUG
+
     nof = -1
 
   END SUBROUTINE close_filter
@@ -833,19 +860,11 @@ SUBROUTINE convolve_pp(y, x, fc, nna)
 
     real(C_DOUBLE), pointer :: xx(:) => NULL()
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
-    type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1)
 
     call reset_time(14)
 
     y = x
 
-    ! Allocate FFTW work space outside the threaded region to avoid
-    ! a GCC compiler segfault when freeing the workspace
-    do id_thread = 0, nthreads - 1
-       pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
-       pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
-    end do
-
     !$OMP PARALLEL NUM_THREADS(nthreads) &
     !$OMP     DEFAULT(NONE) &
     !$OMP     SHARED(nof, nodetectors, ninterval, noba_short_pp, &
@@ -870,11 +889,6 @@ SUBROUTINE convolve_pp(y, x, fc, nna)
 
     !$OMP END PARALLEL
 
-    do id_thread = 0, nthreads - 1
-       call fftw_free(pxx(id_thread))
-       call fftw_free(pfx(id_thread))
-    end do
-
     cputime_filter = cputime_filter + get_time(14)
 
   END SUBROUTINE convolve_pp
@@ -897,7 +911,6 @@ SUBROUTINE construct_preconditioner(nna)
 
     real(C_DOUBLE), pointer :: xx(:) => NULL()
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
-    type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1)
 
     if (precond_width_max < 1) return
 
@@ -969,13 +982,6 @@ SUBROUTINE construct_preconditioner(nna)
 
     call allocate_bandprec
 
-    ! Allocate FFTW work space outside the threaded region to avoid
-    ! a GCC compiler segfault when freeing the workspace
-    do id_thread = 0, nthreads - 1
-       pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
-       pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
-    end do
-
     !$OMP PARALLEL NUM_THREADS(nthreads) &
     !$OMP     DEFAULT(NONE) &
     !$OMP     SHARED(nof, npsdtot, fcov, invcov, pxx, pfx) &
@@ -996,11 +1002,6 @@ SUBROUTINE construct_preconditioner(nna)
 
     !$OMP END PARALLEL
 
-    do id_thread = 0, nthreads - 1
-       call fftw_free(pxx(id_thread))
-       call fftw_free(pfx(id_thread))
-    end do
-
     if (.not. use_cgprecond) then
        nempty = 0
        nfail = 0
@@ -1155,7 +1156,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
     real(C_DOUBLE), pointer :: xx(:) => NULL()
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
-    type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1)
     ! DEBUG begin
     real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), &
          time_cgprecond_filter(:), time_cgprecond_convolve(:)
@@ -1190,15 +1190,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
        ! Apply the filter-based preconditioner
        call convolve_pp(z, r, fprecond, nna)
     else
-       ! Allocate FFTW work space outside the threaded region to avoid
-       ! a GCC compiler segfault when freeing the workspace
-       t1 = MPI_Wtime()  ! DEBUG
-       do id_thread = 0, nthreads - 1
-          pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
-          pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
-       end do
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_alloc", MPI_Wtime() - t1  ! DEBUG
-
        !$OMP PARALLEL NUM_THREADS(nthreads) &
        !$OMP     DEFAULT(NONE) &
        !$OMP     SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, &
@@ -1268,12 +1259,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
        !$OMP END PARALLEL
 
-       t1 = MPI_Wtime()  ! DEBUG
-       do id_thread = 0, nthreads - 1
-          call fftw_free(pxx(id_thread))
-          call fftw_free(pfx(id_thread))
-       end do
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "fftw_free", MPI_Wtime() - t1  ! DEBUG
     end if
 
     ! DEBUG begin

From a0ba8534712488b2fd3d74e0338e5fb933154583 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Mon, 19 Aug 2019 16:13:46 -0700
Subject: [PATCH 06/17] Remove debug statements

---
 src/noise_routines.f90  | 93 ++++++-----------------------------------
 src/smadam_routines.f90 | 23 +---------
 2 files changed, 13 insertions(+), 103 deletions(-)

diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 18d1196..270a2e1 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -264,7 +264,6 @@ SUBROUTINE init_filter
 
     ! Permanently allocate space for FFT for each OpenMP thread
 
-    t1 = MPI_Wtime()  ! DEBUG
     allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr)
     if (ierr /= 0) stop 'Failed to allocate pxx and pfx'
     pxx = C_NULL_PTR
@@ -273,7 +272,6 @@ SUBROUTINE init_filter
        pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
        pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
     end do
-    write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", -1, "fftw_alloc", MPI_Wtime() - t1  ! DEBUG
 
   END SUBROUTINE init_filter
 
@@ -330,13 +328,11 @@ SUBROUTINE close_filter
 
     call close_fourier
 
-    t1 = MPI_Wtime()  ! DEBUG
     do id_thread = 0, nthreads - 1
        call fftw_free(pxx(id_thread))
        call fftw_free(pfx(id_thread))
     end do
     deallocate(pxx, pfx)
-    write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", 999, "fftw_free", MPI_Wtime() - t1  ! DEBUG
 
     nof = -1
 
@@ -1151,16 +1147,11 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
     integer, intent(in) :: istep
 
     integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, &
-         num_threads, m, nband, ipsd, isub, niter
-    real(dp) :: t1, t2, tf, t_fftw_memory, tc
+         num_threads, m, nband, ipsd, isub
+    real(dp) :: t1, t2, tf
 
     real(C_DOUBLE), pointer :: xx(:) => NULL()
     complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL()
-    ! DEBUG begin
-    real(dp), allocatable :: time_cholesky(:), time_cgprecond(:), &
-         time_cgprecond_filter(:), time_cgprecond_convolve(:)
-    integer, allocatable :: ncall_cholesky(:), ncall_cgprecond(:), niter_cgprecond(:)
-    ! DEBUG end
 
     z = r
 
@@ -1170,20 +1161,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
     call reset_time(16)
 
-    ! DEBUG begin
-    allocate(time_cholesky(0:nthreads-1), time_cgprecond(0:nthreads-1), &
-         time_cgprecond_filter(0:nthreads-1), time_cgprecond_convolve(0:nthreads-1), &
-         ncall_cholesky(0:nthreads-1), &
-         ncall_cgprecond(0:nthreads-1), niter_cgprecond(0:nthreads-1))
-    time_cholesky = 0
-    time_cgprecond = 0
-    time_cgprecond_filter = 0
-    time_cgprecond_convolve = 0
-    ncall_cholesky = 0
-    ncall_cgprecond = 0
-    niter_cgprecond = 0
-    ! DEBUG end
-
     if (use_diagonal) then
        z(0, :, :) = r(0, :, :) * prec_diag
     else if (use_fprecond) then
@@ -1194,12 +1171,10 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
        !$OMP     DEFAULT(NONE) &
        !$OMP     SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, &
        !$OMP         id, nof, nshort, detectors, nthreads, fprecond, nna, &
-       !$OMP         baselines_short_time, invcov, fcov, checknan, pxx, pfx, &
-       !$OMP         time_cholesky, ncall_cholesky, time_cgprecond, &
-       !$OMP         ncall_cgprecond, niter_cgprecond, time_cgprecond_filter, time_cgprecond_convolve) &
+       !$OMP         baselines_short_time, invcov, fcov, checknan, pxx, pfx) &
        !$OMP     PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, &
        !$OMP         xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, &
-       !$OMP         t1, t2, tf, tc, niter)
+       !$OMP         t1, t2, tf)
 
        !t1 = get_time(55)
        !tf = 0
@@ -1226,29 +1201,16 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
                    ipsd = psd_index(idet, baselines_short_time(kstart+1))
                    if (allocated(bandprec(isub, ichunk, idet)%data)) then
                       ! Use the precomputed Cholesky decomposition
-                      t1 = MPI_Wtime()  ! DEBUG
                       nband = size(bandprec(isub, ichunk, idet)%data, 1)
                       call apply_cholesky_decomposition( &
                            noba, z(0, kstart+1, idet), r(0, kstart+1, idet), &
                            nband, bandprec(isub, ichunk, idet)%data)
-                      ! DEBUG begin
-                      time_cholesky(id_thread) = time_cholesky(id_thread) + (MPI_Wtime() - t1)
-                      ncall_cholesky(id_thread) = ncall_cholesky(id_thread) + 1
-                      ! DEBUG end
                    else
                       ! Use CG iteration to apply the preconditioner
-                      t1 = MPI_Wtime()  ! DEBUG
                       call apply_CG_preconditioner( &
                            noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), &
                            z(0, kstart+1, idet), r(0, kstart+1, idet), &
-                           invcov(1, ipsd), fx, xx, tf, tc, niter)
-                      ! DEBUG begin
-                      time_cgprecond(id_thread) = time_cgprecond(id_thread) + (MPI_Wtime() - t1)
-                      time_cgprecond_filter(id_thread) = time_cgprecond_filter(id_thread) + tf
-                      time_cgprecond_convolve(id_thread) = time_cgprecond_convolve(id_thread) + tc
-                      niter_cgprecond(id_thread) = niter_cgprecond(id_thread) + niter
-                      ncall_cgprecond(id_thread) = ncall_cgprecond(id_thread) + 1
-                      ! DEBUG end
+                           invcov(1, ipsd), fx, xx, tf)
                    end if
                 end if
                 kstart = kstart + noba
@@ -1261,24 +1223,6 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
 
     end if
 
-    ! DEBUG begin
-    do id_thread = 0, nthreads - 1
-       write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i8)') &
-            "step", istep, " thread", id_thread, &
-            " time_cholesky", time_cholesky(id_thread), &
-            " ncall_cholesky", ncall_cholesky(id_thread)
-       write(1000 + id, '(a,x,i3,a,x,i3,a,x,f8.2,a,x,i6,a,x,i8,a,x,f8.2,a,x,f8.2)') &
-            "step", istep, " thread", id_thread, &
-            " time_cgprecond", time_cgprecond(id_thread), &
-            " ncall_cgprecond", ncall_cgprecond(id_thread), &
-            " niter_cgprecond", niter_cgprecond(id_thread), &
-            " time_cgprecond_filter", time_cgprecond_filter(id_thread), &
-            " time_cgprecond_convolve", time_cgprecond_convolve(id_thread)
-    end do
-    deallocate(time_cholesky, time_cgprecond, time_cgprecond_filter, &
-         ncall_cholesky, ncall_cgprecond, niter_cgprecond)
-    ! DEBUG end
-
     cputime_precond = cputime_precond + get_time(16)
 
   END SUBROUTINE preconditioning_band
@@ -1331,7 +1275,7 @@ end subroutine get_cholesky_decomposition
 
 
   subroutine apply_CG_preconditioner( &
-       noba, nof, fcov, nna, x, b, invcov, fx, xx, tf, tc, niter)
+       noba, nof, fcov, nna, x, b, invcov, fx, xx, tfilter)
     !
     ! Apply the preconditioner using CG iteration
     !
@@ -1342,8 +1286,7 @@ subroutine apply_CG_preconditioner( &
     real(dp), intent(in) :: b(noba)
     real(C_DOUBLE), intent(inout) :: xx(nof)
     complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1)
-    real(dp), intent(inout) :: tf, tc
-    integer, intent(inout) :: niter
+    real(dp), intent(out) :: tfilter
 
     real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:)
     real(dp) :: alpha, beta, rr, rr0, rr_old, rz, rz_old, Anorm
@@ -1353,7 +1296,6 @@ subroutine apply_CG_preconditioner( &
     ! Polak-Ribiere formula to allow for a changing preconditioner
     real(dp), parameter :: cglimit = 1e-6
     integer, parameter :: itermax = 100
-    real(dp) :: tfilter, tconv  ! DEBUG
 
     allocate(resid(noba), prop(noba), Aprop(noba), zresid(noba), &
          precond(noba), stat=ierr)
@@ -1372,10 +1314,7 @@ subroutine apply_CG_preconditioner( &
 
     prop = zresid
     iter = 1
-    ! DEBUG begin
     tfilter = 0
-    tconv = 0
-    ! DEBUG end
     do
        call apply_A(prop, Aprop)
        Anorm = dot_product(prop, Aprop)
@@ -1396,12 +1335,6 @@ subroutine apply_CG_preconditioner( &
 
     deallocate(resid, prop, Aprop, zresid, precond)
 
-    ! DEBUG begin
-    tf = tfilter
-    tc = tconv
-    niter = iter
-    ! DEBUG end
-
   contains
 
     subroutine apply_precond(x, z)
@@ -1431,20 +1364,18 @@ subroutine convolve(x, fc, y)
       real(dp), intent(in) :: x(noba)
       complex(dp) :: fc(nof / 2 + 1)
       real(dp), intent(out) :: y(noba)
-      real(dp) :: t1, t2
-      t1 = MPI_Wtime()  ! DEBUG
+      !real(dp) :: t
       xx(:noba) = x - sum(x) / noba
       xx(noba + 1:) = 0
-      t2 = MPI_Wtime()  ! DEBUG
+      !t = MPI_Wtime()
       call dfft(fx, xx)
-      tfilter = tfilter + (MPI_Wtime() - t2)  ! DEBUG
+      !tfilter = tfilter + (MPI_Wtime() - t)
       fx = fx * fc
-      t2 = MPI_Wtime()  ! DEBUG
+      !t = MPI_Wtime()
       call dfftinv(xx, fx)
-      tfilter = tfilter + (MPI_Wtime() - t2)  ! DEBUG
+      !tfilter = tfilter + (MPI_Wtime() - t)
       y = xx(:noba)
       y = y - sum(y) / noba
-      tconv = tconv + (MPI_Wtime() - t1)  ! DEBUG
     end subroutine convolve
 
   end subroutine apply_CG_preconditioner
diff --git a/src/smadam_routines.f90 b/src/smadam_routines.f90
index 63a360b..37686a3 100644
--- a/src/smadam_routines.f90
+++ b/src/smadam_routines.f90
@@ -787,8 +787,6 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
        if (istep >= iter_max) exit
        istep = istep + 1
 
-       t1 = MPI_Wtime()  ! DEBUG
-
        call reset_time(12)
 
        ! 1) evaluate A.p
@@ -813,33 +811,23 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
           end if
        end if
 
-       t2 = MPI_Wtime()  ! DEBUG
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "baseline_to_map", t2 - t1  ! DEBUG
-
        ! Communicate maps between processes
        call wait_mpi
        cputime_cga_1 = cputime_cga_1 + get_time_and_reset(12)
 
        wamap = 0
-       t1 = MPI_Wtime()  ! DEBUG
        call collect_map(wamap, nosubpix_cross, .true.) ! locmap -> wamap
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "collect_map", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_mpi_reduce = cputime_cga_mpi_reduce + get_time_and_reset(12)
        ! apply cca, rejects masked pixels
-       t1 = MPI_Wtime()  ! DEBUG
        call ccmultiply(cca, wamap, nopix_cross)
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "ccmultiply", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_cc = cputime_cga_cc + get_time_and_reset(12)
-       t1 = MPI_Wtime()  ! DEBUG
        call scatter_map(wamap, nosubpix_cross) ! wamap -> locmap
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "scatter_map", MPI_Wtime() - t1  ! DEBUG
        call wait_mpi
        cputime_cga_mpi_scatter = cputime_cga_mpi_scatter &
             + get_time_and_reset(12)
 
-       t1 = MPI_Wtime()  ! DEBUG
        if (kfilter) then
           call cinvmul(ap, p, nna)
        else
@@ -856,13 +844,11 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
              end do
           end if
        end if
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "cinvmul", MPI_Wtime() - t1  ! DEBUG
 
        ! From map to baseline
 
        call reset_time(12)
 
-       t1 = MPI_Wtime()  ! DEBUG
        ap_all_threads = 0
 
        if (basis_order == 0) then
@@ -883,13 +869,11 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
 
        ap = ap + sum(ap_all_threads, dim=4)
 
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "map_to_baseline", MPI_Wtime() - t1  ! DEBUG
        cputime_cga_2 = cputime_cga_2 + get_time(12)
 
        ! 2) Evaluate p^T.A.p
 
        call wait_mpi  ! DEBUB
-       t1 = MPI_Wtime()  ! DEBUG
        pap = sum(p * ap, mask=rmask)
        call sum_mpi(pap)
 
@@ -904,18 +888,14 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
             aa(:, 1:noba_short, 1:nodetectors) + alpha * p
        r = r - alpha * ap
 
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "update_guess", MPI_Wtime() - t1  ! DEBUG
        ! 5) Precondition
 
        call wait_mpi  ! DEBUB
-       t1 = MPI_Wtime()  ! DEBUG
        call apply_preconditioner(z, r, istep)
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "precondition", MPI_Wtime() - t1  ! DEBUG
 
        ! 6) Check for convergence
 
        call wait_mpi  ! DEBUB
-       t1 = MPI_Wtime()  ! DEBUG
        rzo = rz
        rz = sum(r * z, mask=rmask)
        rz2 = sum(ro * z, mask=rmask)
@@ -929,9 +909,8 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca)
        ! This is the Polak-Ribiere formula that
        ! allows for updates to the preconditioner
        beta = (rz - rz2) / rzo
-       write(1000 + id, '(a,x,i3,x,a,x,f8.2)') "step", istep, "check_convergence", MPI_Wtime() - t1  ! DEBUG
 
-       if (ID==0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') &
+       if (ID == 0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') &
             istep, rr / rrinit, rz2 / rz, alpha, beta, get_time_and_reset(99)
 
        if (rr / rrinit > 1e3) call abort_mpi('CG is diverging')

From e14dbf5f2c194728dfd62adf92f8069bba1d3456 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Wed, 21 Aug 2019 10:21:50 -0700
Subject: [PATCH 07/17] initialize filter in a separate call

---
 src/noise_routines.f90 | 3 +--
 src/smadam.F90         | 4 ++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 270a2e1..c2e9dfe 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -241,6 +241,7 @@ SUBROUTINE init_filter
     real(dp) :: t1
 
     if (.not. kfilter) return
+    if (nof >= 0) return  ! Filter already initialized
 
     nofilter = 2 * noba_short_pp_max
 
@@ -358,8 +359,6 @@ SUBROUTINE build_filter()
     if (info == 3 .and. ID == 0) write(*,*) 'Building filter...'
     if (info > 4) write(*,*) 'Building filter...'
 
-    if (nof < 0) call init_filter
-
     npsdtot = 0
     do idet = 1, nodetectors
        npsdtot = npsdtot + detectors(idet)%npsd
diff --git a/src/smadam.F90 b/src/smadam.F90
index 2a04be1..7c49527 100644
--- a/src/smadam.F90
+++ b/src/smadam.F90
@@ -254,6 +254,10 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, &
 
           cputime_init = cputime_init + get_time(1)
 
+          call tic
+          call init_filter
+          if (id == 0) call toc('init_filter')
+
           call tic
           call build_filter
           if (id == 0) call toc('build_filter')

From bca5532b981acd77c8be4faabbd9af0f082ed752 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Wed, 21 Aug 2019 12:29:51 -0700
Subject: [PATCH 08/17] use unaligned FFT in Madam

---
 src/commonparam.f90       |   1 +
 src/fourier_fftw_2003.f90 |   7 +-
 src/noise_routines.f90    | 194 ++++++++++++++++++++++----------------
 3 files changed, 120 insertions(+), 82 deletions(-)

diff --git a/src/commonparam.f90 b/src/commonparam.f90
index ba4f403..991e1f4 100644
--- a/src/commonparam.f90
+++ b/src/commonparam.f90
@@ -130,6 +130,7 @@ MODULE commonparam
   real(dp) :: dnshort=-1
   integer :: nlong=-1, nshort=-1
   logical :: kfirst=.true., kfilter=.false.
+  logical :: unaligned_fft=.true.
 
   real(dp) :: cglimit=1.d-12
   integer :: iter_min=3, iter_max=1000
diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90
index d8cb432..e99b37c 100644
--- a/src/fourier_fftw_2003.f90
+++ b/src/fourier_fftw_2003.f90
@@ -21,9 +21,10 @@ MODULE fourier
 
 CONTAINS
 
-  SUBROUTINE init_fourier(nof_in)
+  SUBROUTINE init_fourier(nof_in, unaligned)
 
     integer, intent(in) :: nof_in
+    logical, intent(in) :: unaligned
     integer :: fftw_planning_strategy
     real(C_DOUBLE), pointer :: in(:)
     complex(C_DOUBLE_COMPLEX), pointer :: out(:)
@@ -63,7 +64,9 @@ SUBROUTINE init_fourier(nof_in)
 
     fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT)
     !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_PRESERVE_INPUT)
-    !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_UNALIGNED)
+    if (unaligned) then
+       fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_UNALIGNED)
+    end if
 
     plan = fftw_plan_dft_r2c_1d(nof, in, out, fftw_planning_strategy)
     plan_inv = fftw_plan_dft_c2r_1d(nof, out, in, fftw_planning_strategy)
diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index c2e9dfe..0edd40c 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -43,6 +43,8 @@ MODULE noise_routines
 
   ! Pointers for FFT workspace
   type(C_PTR), allocatable :: pxx(:), pfx(:)
+  real(C_DOUBLE), allocatable, target :: xxthreads(:, :)
+  complex(C_DOUBLE_COMPLEX), allocatable, target :: fxthreads(:, :)
 
   ! PSD information
 
@@ -165,18 +167,18 @@ subroutine interpolate_psd(freq, psd, targetfreq, targetpsd)
        end if
        flog = log(targetfreq(i))
        if (j < n_in - 1) then
-          do while (logfreq(j+1) < flog)
+          do while (logfreq(j + 1) < flog)
              j = j + 1
              if (j == n_in - 1) exit
           end do
        end if
-       if (psd(j) > 0 .and. psd(j+1) > 0) then
+       if (psd(j) > 0 .and. psd(j + 1) > 0) then
           ! Logarithmic interpolation
-          r = (flog - logfreq(j)) / (logfreq(j+1) - logfreq(j))
+          r = (flog - logfreq(j)) / (logfreq(j + 1) - logfreq(j))
           targetpsd(i) = exp(logpsd(j) * (1._dp - r) + logpsd(j + 1) * r)
        else
           ! Linear interpolation
-          r = (targetfreq(i) - freq(j)) / (freq(j+1) - freq(j))
+          r = (targetfreq(i) - freq(j)) / (freq(j + 1) - freq(j))
           targetpsd(i) = psd(j) * (1._dp - r) + psd(j + 1) * r
        end if
     end do
@@ -223,7 +225,7 @@ function psd_index_det(idet, starttime) result(ipsd)
           if (detectors(idet)%psdstarts(ipsd) > starttime) ipsd = -1
           return
        end if
-       if (detectors(idet)%psdstarts(ipsd+1) > starttime) exit
+       if (detectors(idet)%psdstarts(ipsd + 1) > starttime) exit
        ipsd = ipsd + 1
     end do
 
@@ -243,6 +245,8 @@ SUBROUTINE init_filter
     if (.not. kfilter) return
     if (nof >= 0) return  ! Filter already initialized
 
+    if (info == 3 .and. ID == 0) write(*,*) 'Initializing filter...'
+
     nofilter = 2 * noba_short_pp_max
 
     nof_min = nofilter
@@ -256,23 +260,29 @@ SUBROUTINE init_filter
     end do
     if (ID == 0) write(*,*) 'FFT length =', nof
 
-    !allocate(fx(nof/2+1), xx(nof), stat=ierr) ! Work space
+    !allocate(fx(nof / 2 + 1), xx(nof), stat=ierr) ! Work space
     !if (ierr /= 0) call abort_mpi('No room for fx and xx')
 
     memory_filter = ((nof / 2 + 1) * 16. + nof * 8) * nthreads
 
-    call init_fourier(nof)
+    call init_fourier(nof, unaligned_fft)
 
     ! Permanently allocate space for FFT for each OpenMP thread
 
-    allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr)
-    if (ierr /= 0) stop 'Failed to allocate pxx and pfx'
-    pxx = C_NULL_PTR
-    pfx = C_NULL_PTR
-    do id_thread = 0, nthreads - 1
-       pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
-       pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
-    end do
+    if (unaligned_fft) then
+       allocate(xxthreads(nof, 0:nthreads - 1), &
+            fxthreads(nof / 2 + 1, 0:nthreads - 1), stat=ierr)
+       if (ierr /= 0) stop 'Failed to allocate xxthreads and fxthreads'
+    else
+       allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr)
+       if (ierr /= 0) stop 'Failed to allocate pxx and pfx'
+       pxx = C_NULL_PTR
+       pfx = C_NULL_PTR
+       do id_thread = 0, nthreads - 1
+          pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T))
+          pfx(id_thread) = fftw_alloc_complex(int(nof / 2 + 1, C_SIZE_T))
+       end do
+    end if
 
   END SUBROUTINE init_filter
 
@@ -329,11 +339,17 @@ SUBROUTINE close_filter
 
     call close_fourier
 
-    do id_thread = 0, nthreads - 1
-       call fftw_free(pxx(id_thread))
-       call fftw_free(pfx(id_thread))
-    end do
-    deallocate(pxx, pfx)
+    if (unaligned_fft) then
+       if (allocated(xxthreads)) deallocate(xxthreads, fxthreads)
+    else
+       if (allocated(pxx)) then
+          do id_thread = 0, nthreads - 1
+             call fftw_free(pxx(id_thread))
+             call fftw_free(pfx(id_thread))
+          end do
+          deallocate(pxx, pfx)
+       end if
+    end if
 
     nof = -1
 
@@ -366,13 +382,13 @@ SUBROUTINE build_filter()
 
     ! Construct and store the filters
     if (allocated(fcov)) deallocate(fcov)
-    allocate(fcov(nof/2+1, npsdtot), & ! Fourier inverse of the noise filter
+    allocate(fcov(nof / 2 + 1, npsdtot), & ! Fourier inverse of the noise filter
          psddet(npsdtot), psdind(npsdtot), psdstart(npsdtot), psdstop(npsdtot), &
          stat=ierr)
     if (ierr /= 0) call abort_mpi('No room for fcov')
-    memory_filter = memory_filter + (nof/2+1)*npsdtot*16. + npsdtot*24.
+    memory_filter = memory_filter + (nof / 2 + 1)*npsdtot*16. + npsdtot*24.
 
-    memsum = memory_filter / 2**20
+    memsum = memory_filter / 2 ** 20
     mem_min = memsum; mem_max = memsum
     call min_mpi(mem_min); call max_mpi(mem_max)
     call sum_mpi(memsum)
@@ -396,7 +412,7 @@ SUBROUTINE build_filter()
     id_thread = omp_get_thread_num()
     num_threads = omp_get_num_threads()
 
-    allocate(aspec(nof/2+1), spectrum(nof), f(nof), g(nof), stat=ierr)
+    allocate(aspec(nof / 2 + 1), spectrum(nof), f(nof), g(nof), stat=ierr)
     if (ierr /= 0) call abort_mpi('No room for aspec')
 
     ipsdtot = 0
@@ -415,7 +431,7 @@ SUBROUTINE build_filter()
           psdind(ipsdtot) = ipsd
           psdstart(ipsdtot) = detectors(idet)%psdstarts(ipsd)
           if (ipsd < detectors(idet)%npsd) then
-             psdstop(ipsdtot) = detectors(idet)%psdstarts(ipsd+1)
+             psdstop(ipsdtot) = detectors(idet)%psdstarts(ipsd + 1)
           else
              psdstop(ipsdtot) = 1e30
           end if
@@ -436,23 +452,23 @@ SUBROUTINE build_filter()
              end if
 
              do i = 1, nof
-                x = pi*f(i)/fa
+                x = pi * f(i) / fa
                 if (x < 1.e-30) then
-                   g(i) = 1.d0
+                   g(i) = 1
                 else
-                   g(i) = sin(x)**2/x**2
+                   g(i) = sin(x) ** 2 / x ** 2
                 end if
              end do
 
              if (k == 0) then
                 aspec(1) = spectrum(1)
              else
-                aspec(1) = aspec(1) + 2*spectrum(1)*g(1)
+                aspec(1) = aspec(1) + 2 * spectrum(1) * g(1)
              end if
 
-             do i = 2, nof/2+1
-                aspec(i) = aspec(i) + spectrum(i)*g(i) &
-                     + spectrum(nof-i+2)*g(nof-i+2)
+             do i = 2, nof / 2 + 1
+                aspec(i) = aspec(i) + spectrum(i) * g(i) &
+                     + spectrum(nof - i + 2) * g(nof - i + 2)
              end do
 
           end do
@@ -510,7 +526,7 @@ SUBROUTINE read_spectrum()
          end if
 
          allocate(ftable(nolines), spectrum_table(nolines, nodetectors), &
-              dtemp(nocols+1, nolines), stemp(nocols), sigmas(nocols), &
+              dtemp(nocols + 1, nolines), stemp(nocols), sigmas(nocols), &
               stat=ierr)
          if (ierr /= 0) call abort_mpi('No room to read the noise spectra')
 
@@ -540,7 +556,7 @@ SUBROUTINE read_spectrum()
          do idet = 1, nodetectors
             ! find the column of the read table that corresponds to
             ! detector # idet
-            do icol = 1, nocols+1
+            do icol = 1, nocols + 1
                if (icol > nocols) then
                   call abort_mpi('ERROR: ' // trim(detectors(idet)%name) // &
                        ' not in ' // trim(file_spectrum))
@@ -554,7 +570,7 @@ SUBROUTINE read_spectrum()
 
             ! prepare for logaritmic interpolation
             ! first column was the frequency
-            spectrum_table(:, idet) = log(dtemp(icol+1, :))
+            spectrum_table(:, idet) = log(dtemp(icol + 1, :))
          end do
 
          deallocate(stemp, dtemp)
@@ -564,7 +580,7 @@ SUBROUTINE read_spectrum()
       call broadcast_mpi(nolines, 0)
       do idet = 1, nodetectors
          call broadcast_mpi(detectors(idet)%sigmas, detectors(idet)%npsd, 0)
-         detectors(idet)%weights = 1 / detectors(idet)%sigmas**2
+         detectors(idet)%weights = 1 / detectors(idet)%sigmas ** 2
       end do
 
       if (id /= 0) then
@@ -600,16 +616,16 @@ SUBROUTINE get_spectrum_file()
          logf = log(f(i))
 
          do
-            if (logf <= ftable(k+1) .or. k == nolines-1) exit
+            if (logf <= ftable(k + 1) .or. k == nolines-1) exit
             k = k + 1
          end do
 
-         p = (logf-ftable(k)) / (ftable(k+1)-ftable(k))
+         p = (logf - ftable(k)) / (ftable(k + 1) - ftable(k))
 
          if (p < 0) p = 0
          if (p > 1) p = 1
 
-         logspec = (1.d0-p)*spectrum_table(k, icol) + p*spectrum_table(k+1, icol)
+         logspec = (1.d0 - p) * spectrum_table(k, icol) + p * spectrum_table(k + 1, icol)
          spectrum(i) = exp(logspec) ! *sigma*sigma/fsample  -RK
       end do
 
@@ -637,7 +653,7 @@ SUBROUTINE get_spectrum_interp(idet, ipsd, f, spectrum)
 
          ! the sigmas have already been updated from the PSDs
 
-         !plateau = detectors(idet)%sigmas(ipsd)**2 / fsample
+         !plateau = detectors(idet)%sigmas(ipsd) ** 2 / fsample
          plateau = detectors(idet)%plateaus(ipsd)
 
       else
@@ -695,7 +711,7 @@ SUBROUTINE get_spectrum_interp(idet, ipsd, f, spectrum)
          loop_bins : do i = 1, nof
             if (spectrum(i) <= 0) then
                ! First look for valid value in higher frequency
-               loop_next_bins: do j = i+1, nof
+               loop_next_bins: do j = i + 1, nof
                   if (spectrum(j) > 0) then
                      spectrum(i) = spectrum(j)
                      cycle loop_bins
@@ -755,7 +771,7 @@ subroutine trim_interval(kstart, kstop, noba, idet, nna)
          nna(0:basis_order, 0:basis_order, noba_short, nodetectors)
     integer :: gapmin, gaplen, i
 
-    if (all(nna(:, :, kstart+1:kstop, idet) == 0)) then
+    if (all(nna(:, :, kstart + 1:kstop, idet) == 0)) then
        noba = 0
        return
     end if
@@ -763,7 +779,7 @@ subroutine trim_interval(kstart, kstop, noba, idet, nna)
     ! trim the beginning
 
     do while (kstart < kstop)
-       if (any(nna(:, :, kstart+1, idet) /= 0)) exit
+       if (any(nna(:, :, kstart + 1, idet) /= 0)) exit
        kstart = kstart + 1
     end do
 
@@ -799,19 +815,19 @@ subroutine convolve_interval(ichunk, idet, x, y, xx, fx, fc, nna)
     integer, intent(in) :: ichunk, idet
     real(dp), intent(out) :: y(0:basis_order, noba_short, nodetectors)
     real(dp), intent(in) :: x(0:basis_order, noba_short, nodetectors)
-    complex(dp), intent(in) :: fc(nof/2 + 1, npsdtot)
+    complex(dp), intent(in) :: fc(nof / 2 + 1, npsdtot)
     real(dp), intent(in)  :: &
          nna(0:basis_order, 0:basis_order, noba_short, nodetectors)
 
     real(C_DOUBLE) :: xx(nof)
-    complex(C_DOUBLE_COMPLEX) :: fx(nof/2 + 1)
+    complex(C_DOUBLE_COMPLEX) :: fx(nof / 2 + 1)
     integer :: noba, kstart, kstop, ipsd
 
     kstart = sum(noba_short_pp(1:ichunk-1))
     kstop = kstart + noba_short_pp(ichunk)
-    ipsd = psd_index(idet, baselines_short_time(kstart+1))
+    ipsd = psd_index(idet, baselines_short_time(kstart + 1))
 
-    y(0, kstart+1:kstop, idet) = x(0, kstart+1:kstop, idet)
+    y(0, kstart + 1:kstop, idet) = x(0, kstart + 1:kstop, idet)
 
     if (ipsd == -1) then
        ! no PSD
@@ -824,14 +840,14 @@ subroutine convolve_interval(ichunk, idet, x, y, xx, fx, fc, nna)
        call trim_interval(kstart, kstop, noba, idet, nna)
        if (noba == 0) exit
 
-       xx(1:noba) = x(0, kstart+1:kstart+noba, idet)
+       xx(1:noba) = x(0, kstart + 1:kstart+noba, idet)
        xx(1:noba) = xx(1:noba) - sum(xx(1:noba)) / noba
-       xx(noba+1:) = 0
+       xx(noba + 1:) = 0
        call dfft(fx, xx)
        fx = fx * fc(:, ipsd)
        call dfftinv(xx, fx)
        xx(1:noba) = xx(1:noba) - sum(xx(1:noba)) / noba
-       y(0, kstart+1:kstart+noba, idet) = xx(1:noba)
+       y(0, kstart + 1:kstart+noba, idet) = xx(1:noba)
 
        kstart = kstart + noba
     end do
@@ -847,7 +863,7 @@ SUBROUTINE convolve_pp(y, x, fc, nna)
 
     real(dp), intent(out) :: y(noba_short, nodetectors)
     real(dp), intent(in) :: x(noba_short, nodetectors)
-    complex(dp), intent(in) :: fc(nof/2+1, npsdtot)
+    complex(dp), intent(in) :: fc(nof / 2 + 1, npsdtot)
     real(dp), intent(in)  :: &
          nna(0:basis_order, 0:basis_order, noba_short, nodetectors)
 
@@ -863,15 +879,21 @@ SUBROUTINE convolve_pp(y, x, fc, nna)
     !$OMP PARALLEL NUM_THREADS(nthreads) &
     !$OMP     DEFAULT(NONE) &
     !$OMP     SHARED(nof, nodetectors, ninterval, noba_short_pp, &
-    !$OMP         baselines_short_time, fc, x, y, nthreads, nna, pxx, pfx) &
+    !$OMP         baselines_short_time, fc, x, y, nthreads, nna, pxx, pfx, &
+    !$OMP         unaligned_fft, xxthreads, fxthreads) &
     !$OMP     PRIVATE(idet, ichunk, m, xx, fx, ierr, id_thread, itask, &
     !$OMP         num_threads)
 
     id_thread = omp_get_thread_num()
     num_threads = omp_get_num_threads()
 
-    call c_f_pointer(pxx(id_thread), xx, [nof])
-    call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1])
+    if (unaligned_fft) then
+       xx => xxthreads(:, id_thread)
+       fx => fxthreads(:, id_thread)
+    else
+       call c_f_pointer(pxx(id_thread), xx, [nof])
+       call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1])
+    end if
 
     itask = -1
     do idet = 1, nodetectors
@@ -929,15 +951,15 @@ SUBROUTINE construct_preconditioner(nna)
           do ichunk = 1, ninterval
              kstart = sum(noba_short_pp(1:ichunk-1))
              noba = noba_short_pp(ichunk)
-             ipsddet = psd_index_det(idet, baselines_short_time(kstart+1))
+             ipsddet = psd_index_det(idet, baselines_short_time(kstart + 1))
              if (ipsddet < 0) then
                 ! No PSD available
-                do k = kstart+1, kstart+noba
+                do k = kstart + 1, kstart+noba
                    prec_diag(k, idet) = 1.
                 end do
                 cycle
              end if
-             do k = kstart+1, kstart+noba
+             do k = kstart + 1, kstart+noba
                 if (nna(0, 0, k, idet) == 0) then
                    prec_diag(k, idet) = 1. / detectors(idet)%weights(ipsddet)
                 else
@@ -951,10 +973,10 @@ SUBROUTINE construct_preconditioner(nna)
 
     ! Build fprecond to act as a fall back option
     if (allocated(fprecond)) deallocate(fprecond)
-    allocate(fprecond(nof/2+1, npsdtot), stat=ierr)
+    allocate(fprecond(nof / 2 + 1, npsdtot), stat=ierr)
     if (ierr /= 0) call abort_mpi('No room for fprecond')
     fprecond = 0
-    memory_precond = memory_precond + (nof/2+1)*npsdtot*16.
+    memory_precond = memory_precond + (nof / 2 + 1)*npsdtot*16.
     ipsd = 0
     do idet = 1, nodetectors
        do ipsddet = 1, detectors(idet)%npsd
@@ -979,14 +1001,20 @@ SUBROUTINE construct_preconditioner(nna)
 
     !$OMP PARALLEL NUM_THREADS(nthreads) &
     !$OMP     DEFAULT(NONE) &
-    !$OMP     SHARED(nof, npsdtot, fcov, invcov, pxx, pfx) &
+    !$OMP     SHARED(nof, npsdtot, fcov, invcov, pxx, pfx, &
+    !$OMP         unaligned_fft, xxthreads, fxthreads) &
     !$OMP     PRIVATE(id_thread, num_threads, ipsd, xx, fx)
 
     id_thread = omp_get_thread_num()
     num_threads = omp_get_num_threads()
 
-    call c_f_pointer(pxx(id_thread), xx, [nof])
-    call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1])
+    if (unaligned_fft) then
+       xx => xxthreads(:, id_thread)
+       fx => fxthreads(:, id_thread)
+    else
+       call c_f_pointer(pxx(id_thread), xx, [nof])
+       call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1])
+    end if
 
     do ipsd = 1, npsdtot
        if (modulo(ipsd-1, num_threads) /= id_thread) cycle
@@ -1030,10 +1058,10 @@ SUBROUTINE construct_preconditioner(nna)
              loop_subchunk : do
                 call trim_interval(kstart, kstop, noba, idet, nna)
                 if (noba == 0) exit
-                ipsd = psd_index(idet, baselines_short_time(kstart+1))
+                ipsd = psd_index(idet, baselines_short_time(kstart + 1))
 
                 if (ipsd == -1 .or. noba < 1 .or. &
-                     all(nna(0, 0, kstart+1:kstart+noba, idet) == 0)) then
+                     all(nna(0, 0, kstart + 1:kstart+noba, idet) == 0)) then
                    nempty = nempty + 1
                    cycle
                 end if
@@ -1041,15 +1069,15 @@ SUBROUTINE construct_preconditioner(nna)
                 ! Rough guess for the width of the preconditioner based on
                 ! total power in the band
 
-                p_tot = sum(invcov(:nof/2, ipsd)**2)
+                p_tot = sum(invcov(:nof / 2, ipsd) ** 2)
                 try = 1
                 ierr = -1
                 loop_try : do
-                   ! Rows from kstart+1 to kstart+noba for one band-diagonal
+                   ! Rows from kstart + 1 to kstart+noba for one band-diagonal
                    ! submatrix.  Do the computation in double precision
                    nband = min(noba, try * precond_width_min)
                    nband = min(nband, precond_width_max)
-                   p = sum(invcov(:nband, ipsd)**2)
+                   p = sum(invcov(:nband, ipsd) ** 2)
                    if (p > plim * p_tot) then
                       ierr = 0
                       exit
@@ -1064,7 +1092,7 @@ SUBROUTINE construct_preconditioner(nna)
                    do
                       call get_cholesky_decomposition( &
                            bandprec(isub, ichunk, idet)%data, nband, noba, &
-                           invcov(1, ipsd), nna(0, 0, kstart+1, idet), ierr)
+                           invcov(1, ipsd), nna(0, 0, kstart + 1, idet), ierr)
 
                       if (ierr == 0) exit
 
@@ -1084,7 +1112,7 @@ SUBROUTINE construct_preconditioner(nna)
                    !write (*, '(1x,a,i0,a,es18.10,a,es18.10)') &
                    !     'Cholesky decomposition failed for ' // &
                    !     trim(detectors(idet)%name) // ', noba = ', noba, ', t = ', &
-                   !     baselines_short_time(kstart+1), ' - ', &
+                   !     baselines_short_time(kstart + 1), ' - ', &
                    !     baselines_short_time(kstart+noba)
                    nfail = nfail + 1
                 else
@@ -1114,7 +1142,7 @@ SUBROUTINE construct_preconditioner(nna)
 
     end if
 
-    memsum = memory_precond / 2**20
+    memsum = memory_precond / 2 ** 20
     mem_min = memsum; mem_max = memsum
     call min_mpi(mem_min); call max_mpi(mem_max)
     call sum_mpi(memsum)
@@ -1170,7 +1198,8 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
        !$OMP     DEFAULT(NONE) &
        !$OMP     SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, &
        !$OMP         id, nof, nshort, detectors, nthreads, fprecond, nna, &
-       !$OMP         baselines_short_time, invcov, fcov, checknan, pxx, pfx) &
+       !$OMP         baselines_short_time, invcov, fcov, checknan, pxx, pfx, &
+       !$OMP         unaligned_fft, xxthreads, fxthreads) &
        !$OMP     PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, &
        !$OMP         xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, &
        !$OMP         t1, t2, tf)
@@ -1180,8 +1209,13 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
        id_thread = omp_get_thread_num()
        num_threads = omp_get_num_threads()
 
-       call c_f_pointer(pxx(id_thread), xx, [nof])
-       call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1])
+       if (unaligned_fft) then
+          xx => xxthreads(:, id_thread)
+          fx => fxthreads(:, id_thread)
+       else
+          call c_f_pointer(pxx(id_thread), xx, [nof])
+          call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1])
+       end if
 
        itask = -1
        do idet = 1, nodetectors
@@ -1194,21 +1228,21 @@ SUBROUTINE preconditioning_band(z, r, nna, istep)
                 if (noba == 0) exit loop_subchunk
                 itask = itask + 1
                 if (modulo(itask, num_threads) == id_thread) then
-                   ipsd = psd_index_det(idet, baselines_short_time(kstart+1))
+                   ipsd = psd_index_det(idet, baselines_short_time(kstart + 1))
                    if (ipsd < 0) cycle
                    if (detectors(idet)%weights(ipsd) == 0) cycle
-                   ipsd = psd_index(idet, baselines_short_time(kstart+1))
+                   ipsd = psd_index(idet, baselines_short_time(kstart + 1))
                    if (allocated(bandprec(isub, ichunk, idet)%data)) then
                       ! Use the precomputed Cholesky decomposition
                       nband = size(bandprec(isub, ichunk, idet)%data, 1)
                       call apply_cholesky_decomposition( &
-                           noba, z(0, kstart+1, idet), r(0, kstart+1, idet), &
+                           noba, z(0, kstart + 1, idet), r(0, kstart + 1, idet), &
                            nband, bandprec(isub, ichunk, idet)%data)
                    else
                       ! Use CG iteration to apply the preconditioner
                       call apply_CG_preconditioner( &
-                           noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), &
-                           z(0, kstart+1, idet), r(0, kstart+1, idet), &
+                           noba, nof, fcov(1, ipsd), nna(0, 0, kstart + 1, idet), &
+                           z(0, kstart + 1, idet), r(0, kstart + 1, idet), &
                            invcov(1, ipsd), fx, xx, tf)
                    end if
                 end if
@@ -1279,12 +1313,12 @@ subroutine apply_CG_preconditioner( &
     ! Apply the preconditioner using CG iteration
     !
     integer, intent(in) :: noba, nof
-    complex(dp), intent(in) :: fcov(nof/2+1)
+    complex(dp), intent(in) :: fcov(nof / 2 + 1)
     real(dp), intent(in) :: nna(noba), invcov(noba)
     real(dp), intent(out) :: x(noba)
     real(dp), intent(in) :: b(noba)
     real(C_DOUBLE), intent(inout) :: xx(nof)
-    complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1)
+    complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof / 2 + 1)
     real(dp), intent(out) :: tfilter
 
     real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:)

From 4252a4a083a35571a354fdcebfaf4a45c31f8378 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Wed, 21 Aug 2019 13:12:46 -0700
Subject: [PATCH 09/17] add option to toggle unaligned FFT

---
 src/inputparam.f90        | 2 ++
 src/parameter_control.f90 | 7 ++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/inputparam.f90 b/src/inputparam.f90
index 9e79008..90c0cbd 100644
--- a/src/inputparam.f90
+++ b/src/inputparam.f90
@@ -294,6 +294,8 @@ SUBROUTINE read_line(line)
        read(value, *, iostat=ierr) kfilter
     case ('diagfilter')
        read(value, *, iostat=ierr) diagfilter
+    case ('unaligned_fft')
+       read(value, *, iostat=ierr) unaligned_fft
     case ('precond_width_min')
        read(value, *, iostat=ierr) precond_width_min
     case ('precond_width_max')
diff --git a/src/parameter_control.f90 b/src/parameter_control.f90
index c1170e6..49acbfe 100644
--- a/src/parameter_control.f90
+++ b/src/parameter_control.f90
@@ -620,12 +620,13 @@ SUBROUTINE write_parameters()
 
     if (kfirst) then
        write (*,*)
-       write (*,fk) 'kfirst',kfirst, 'First destriping ON'
-       write (*,ff) 'dnshort',dnshort,'Baseline length (samples)'
+       write (*,fk) 'kfirst', kfirst, 'First destriping ON'
+       write (*,ff) 'dnshort', dnshort, 'Baseline length (samples)'
        write (*,ff) ' ',dnshort/fsample,'seconds'
 
        if (kfilter) then
-          write (*,fk) 'kfilter',kfilter,'Noise filter ON'
+          write (*,fk) 'kfilter', kfilter, 'Noise filter ON'
+          write (*,fk) 'unaligned_fft', unaligned_fft
        else
           write (*,fk) 'kfilter', kfilter, 'Noise filter OFF'
           write (*,fe) 'diagfilter', diagfilter, 'diagonal baseline filter'

From d3985b3dbe8bf95149a0442f8ed3bf25a2f4ac2e Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Thu, 22 Aug 2019 15:27:45 -0700
Subject: [PATCH 10/17] disable system wisdom loadimg

---
 src/fourier_fftw_2003.f90 | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90
index e99b37c..6942bff 100644
--- a/src/fourier_fftw_2003.f90
+++ b/src/fourier_fftw_2003.f90
@@ -46,15 +46,15 @@ SUBROUTINE init_fourier(nof_in, unaligned)
     call c_f_pointer(pin, in, [nof])
     call c_f_pointer(pout, out, [nof/2 + 1])
 
-    ierr = fftw_import_system_wisdom()
-
-    if (info /= 0) then
-       if (ierr /= 0) then
-          write (*,*) 'FFTW: System wide wisdom loaded'
-       else
-          write (*,*) 'FFTW: Unable to load system wisdom'
-       end if
-    end if
+!!$    ierr = fftw_import_system_wisdom()
+!!$
+!!$    if (info /= 0) then
+!!$       if (ierr /= 0) then
+!!$          write (*,*) 'FFTW: System wide wisdom loaded'
+!!$       else
+!!$          write (*,*) 'FFTW: Unable to load system wisdom'
+!!$       end if
+!!$    end if
 
     !if (nof <= 65536) then
     fftw_planning_strategy = FFTW_MEASURE

From f565341e49ea94f95e06964d90c775d36efb2e4e Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Fri, 30 Aug 2019 14:33:29 -0700
Subject: [PATCH 11/17] time filter initialization better

---
 src/fourier_fftw_2003.f90 |  4 ++--
 src/noise_routines.f90    | 20 ++++++++++++++------
 src/smadam.F90            |  9 +++++++++
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90
index 6942bff..6f9e363 100644
--- a/src/fourier_fftw_2003.f90
+++ b/src/fourier_fftw_2003.f90
@@ -57,9 +57,9 @@ SUBROUTINE init_fourier(nof_in, unaligned)
 !!$    end if
 
     !if (nof <= 65536) then
-    fftw_planning_strategy = FFTW_MEASURE
+    !fftw_planning_strategy = FFTW_MEASURE
     !else
-    !   fftw_planning_strategy = FFTW_ESTIMATE
+    fftw_planning_strategy = FFTW_ESTIMATE
     !end if
 
     fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT)
diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 0edd40c..8d0228f 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -19,10 +19,11 @@ MODULE noise_routines
 
   real(dp), save, public :: memory_filter = 0, memory_precond = 0, &
        cputime_filter = 0, cputime_precond = 0, &
-       cputime_prec_construct = 0
+       cputime_prec_construct = 0, &
+       cputime_filter_allocate = 0, &
+       cputime_filter_init = 0, &
+       cputime_filter_build = 0
 
-  !real(dp), allocatable :: xx(:)
-  !complex(dp), allocatable :: fx(:)
   complex(dp), allocatable, target :: fcov(:, :), fprecond(:, :)
   real(dp), allocatable :: invcov(:, :)
 
@@ -260,13 +261,14 @@ SUBROUTINE init_filter
     end do
     if (ID == 0) write(*,*) 'FFT length =', nof
 
-    !allocate(fx(nof / 2 + 1), xx(nof), stat=ierr) ! Work space
-    !if (ierr /= 0) call abort_mpi('No room for fx and xx')
-
     memory_filter = ((nof / 2 + 1) * 16. + nof * 8) * nthreads
 
+    call reset_time(16)
+
     call init_fourier(nof, unaligned_fft)
 
+    cputime_filter_init = cputime_filter_init + get_time_and_reset(16)
+
     ! Permanently allocate space for FFT for each OpenMP thread
 
     if (unaligned_fft) then
@@ -284,6 +286,8 @@ SUBROUTINE init_filter
        end do
     end if
 
+    cputime_filter_allocate = cputime_filter_allocate + get_time(16)
+
   END SUBROUTINE init_filter
 
 
@@ -375,6 +379,8 @@ SUBROUTINE build_filter()
     if (info == 3 .and. ID == 0) write(*,*) 'Building filter...'
     if (info > 4) write(*,*) 'Building filter...'
 
+    call reset_time(16)
+
     npsdtot = 0
     do idet = 1, nodetectors
        npsdtot = npsdtot + detectors(idet)%npsd
@@ -496,6 +502,8 @@ SUBROUTINE build_filter()
 
     if (kread_file) deallocate(ftable, spectrum_table)
 
+    cputime_filter_build = cputime_filter_build + get_time(16)
+
     if (info > 4) write(*,*) 'Done'
 
   CONTAINS
diff --git a/src/smadam.F90 b/src/smadam.F90
index 7c49527..038c807 100644
--- a/src/smadam.F90
+++ b/src/smadam.F90
@@ -499,6 +499,9 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, &
           call write_time('Binning TOD', cputime_bin_maps)
           call write_time('Sending binned TOD', cputime_send_maps)
           call write_time('Counting hits', cputime_count_hits)
+          call write_time('Filter - allocate', cputime_filter_allocate)
+          call write_time('Filter - initialize', cputime_filter_init)
+          call write_time('Filter - build', cputime_filter_build)
           call write_time('Building preconditioner', cputime_prec_construct)
           call write_time('Subtract/add baselines', &
                cputime_clean_tod+cputime_unclean_tod)
@@ -863,6 +866,9 @@ subroutine destripe_with_cache(comm, ndet, nsamp, nnz, &
        call write_time('Binning TOD', cputime_bin_maps)
        call write_time('Sending binned TOD', cputime_send_maps)
        call write_time('Counting hits', cputime_count_hits)
+       call write_time('Filter - allocate', cputime_filter_allocate)
+       call write_time('Filter - initialize', cputime_filter_init)
+       call write_time('Filter - build', cputime_filter_build)
        call write_time('Building preconditioner', cputime_prec_construct)
        call write_time('Subtract/add baselines', &
             cputime_clean_tod + cputime_unclean_tod)
@@ -1354,6 +1360,9 @@ subroutine reset_timers()
     cputime_send_maps = 0
     cputime_count_hits = 0
     cputime_prec_construct = 0
+    cputime_filter_allocate = 0
+    cputime_filter_init = 0
+    cputime_filter_build = 0
 
     cputime_cga_init = 0
     cputime_cga = 0

From e4d6ef193a39400d37178dc6f0a7e34f77d86e3a Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Fri, 30 Aug 2019 16:34:02 -0700
Subject: [PATCH 12/17] make memory reporting more helpful

---
 src/maps_and_baselines.f90 |  23 +-----
 src/maptod_transfer.f90    |  62 +++++++---------
 src/memory_and_time.f90    | 146 +++++++++++++++++++++++--------------
 src/noise_routines.f90     |  25 ++-----
 src/parameter_control.f90  |  12 +--
 src/pointing.f90           |  12 +--
 src/smadam.F90             |  24 +++---
 src/tod_storage.f90        |  11 +--
 8 files changed, 143 insertions(+), 172 deletions(-)

diff --git a/src/maps_and_baselines.f90 b/src/maps_and_baselines.f90
index 79e7019..1e68104 100644
--- a/src/maps_and_baselines.f90
+++ b/src/maps_and_baselines.f90
@@ -8,7 +8,7 @@ MODULE maps_and_baselines
        noba_short, nodetectors, do_binmap, do_hits, do_mask, &
        ndetset, nmap, nopix_cross, nopix_map, nsurvey, use_inmask
   use mpi_wrappers, only : min_mpi, max_mpi, sum_mpi
-  use memory_and_time, only : check_stat
+  use memory_and_time, only : check_stat, write_memory
 
   implicit none
   private
@@ -102,16 +102,7 @@ SUBROUTINE allocate_maps
        allocate(cca(nmap, nmap, 0:0), wamap(nmap, 0:0))
     endif
 
-    memsum = memory_maps / 2. ** 20.
-    mem_min = memsum
-    mem_max = memsum
-    call min_mpi(mem_min)
-    call max_mpi(mem_max)
-    call sum_mpi(memsum)
-
-    if (ID==0 .and. info > 0) then
-       write(*,mstr3) 'Allocated memory for maps:', memsum, mem_min, mem_max
-    end if
+    call write_memory("Map memory", memory_maps)
 
   END SUBROUTINE allocate_maps
 
@@ -171,15 +162,7 @@ SUBROUTINE allocate_baselines
        allocate(yba(1, 1, 1), nna(1, 1, 1, 1))
     endif
 
-    memsum = memory_baselines / 2**20
-    mem_min = memsum
-    mem_max = memsum
-    call min_mpi(mem_min)
-    call max_mpi(mem_max)
-    call sum_mpi(memsum)
-
-    if (ID==0 .and. info > 0) write(*,mstr3)  &
-         'Allocated memory for baselines:', memsum, mem_min, mem_max
+    call write_memory("Baseline memory", memory_baselines)
 
   END SUBROUTINE allocate_baselines
 
diff --git a/src/maptod_transfer.f90 b/src/maptod_transfer.f90
index 6085ff4..5234e36 100644
--- a/src/maptod_transfer.f90
+++ b/src/maptod_transfer.f90
@@ -2,6 +2,7 @@ MODULE maptod_transfer
 
   use commonparam
   use mpi_wrappers
+  use memory_and_time, only : write_memory
   implicit none
   private
 
@@ -54,31 +55,42 @@ SUBROUTINE update_maptod_transfer(ksubmap)
     logical, allocatable, intent(in) :: ksubmap(:)
     integer(i4b) :: ierr
     real(sp) :: memsum, mem_min, mem_max
+    integer :: nolocmaps_min, nolocmaps_max
+    real(dp) :: nolocmaps_sum
 
     nolocmaps = count(ksubmap(0:nosubmaps_tot-1))
     nolocpix = nolocmaps * nosubpix_max
 
+    nolocmaps_min = nolocmaps
+    nolocmaps_max = nolocmaps
+    nolocmaps_sum = nolocmaps
+    call min_mpi(nolocmaps_min)
+    call max_mpi(nolocmaps_max)
+    call sum_mpi(nolocmaps_sum)
+    if (ID == 0 .and. info > 0) then
+       write(*, '(x,"Total submaps = ",i0,"  submap size = ",i0)') &
+            nosubmaps_tot, nosubpix_max
+       write(*, '(x,"Local submaps:  min = ",i0,"  max = ",i0,"  mean = ",f8.2)') &
+            nolocmaps_min, nolocmaps_max, nolocmaps_sum / ntasks
+    end if
+
     if (.not. allocated(ksubmap_table)) then
        ! Default Fortran logical is 32 bits (!)
        if (allreduce) then
-          allocate(ksubmap_table(0:nosubmaps_tot-1, 0:0), stat=ierr)
+          allocate(ksubmap_table(0:nosubmaps_tot - 1, 0:0), stat=ierr)
           memory_ksubmap = nosubmaps_tot*4
        else
-          allocate(ksubmap_table(0:nosubmaps_tot-1, 0:ntasks-1), stat=ierr)
-          memory_ksubmap = nosubmaps_tot*ntasks*4
+          allocate(ksubmap_table(0:nosubmaps_tot - 1, 0:ntasks-1), stat=ierr)
+          memory_ksubmap = nosubmaps_tot * ntasks * 4
        end if
        if (ierr /= 0) call abort_mpi('No room for ksubmap_table')
        ksubmap_table = .false.
-       if (ID==0 .and. info > 0) then
-          memsum = memory_ksubmap / 2**20
-          write(*,mstr3) 'Allocated memory for submap table:', &
-               memsum, memsum, memsum
-       end if
+       call write_memory('Submap table memory', memory_ksubmap)
     end if
 
     if (allreduce) then
        ! All processes share ksubmap
-       ksubmap_table(:, 0) = ksubmap(0:nosubmaps_tot-1)
+       ksubmap_table(:, 0) = ksubmap(0:nosubmaps_tot - 1)
     else
        call mpi_allgather(ksubmap, nosubmaps_tot, MPI_LOGICAL, &
             ksubmap_table, nosubmaps_tot, MPI_LOGICAL, comm, ierr)
@@ -98,7 +110,7 @@ SUBROUTINE update_maptod_transfer(ksubmap)
 
        if (ierr /= 0) call abort_mpi('Failed to allocate locmap')
 
-       memory_locmap = (nsize_locmap+1) * (nmap*8.+nmap**2*8.+8)
+       memory_locmap = (nsize_locmap + 1) * (nmap * 8 + nmap ** 2 * 8 + 4)
 
     end if
 
@@ -107,17 +119,7 @@ SUBROUTINE update_maptod_transfer(ksubmap)
     locmask = 0
     lochits = 0
 
-    memsum = memory_locmap / 2**20
-    mem_min = memsum
-    mem_max = memsum
-    call min_mpi(mem_min)
-    call max_mpi(mem_max)
-    call sum_mpi(memsum)
-
-    if (ID==0 .and. info > 0) then
-       write(*,mstr3) 'Allocated memory for local maps:', &
-            memsum, mem_min, mem_max
-    end if
+    call write_memory('local maps memory', memory_locmap)
 
   END SUBROUTINE update_maptod_transfer
 
@@ -165,14 +167,7 @@ SUBROUTINE initialize_alltoallv()
                + recvcounts_gather(itask-1)
        end do
 
-       memsum = memory_all2all / 2**20
-       mem_min = memsum; mem_max = memsum
-       call min_mpi(mem_min); call max_mpi(mem_max)
-       call sum_mpi(memsum)
-       if (ID == 0 .and. info > 0) then
-          write(*,'(x,a,t32,3(f9.1," MB"))') &
-               'Allocated memory for allreduce:', memsum, mem_min, mem_max
-       end if
+       call write_memory("Allreduce memory", memory_all2all)
     end if
 
     ! allreduce implies concatenate_messages = .false.
@@ -293,14 +288,7 @@ SUBROUTINE initialize_alltoallv()
        offset = offset + nrecv
     end do
 
-    memsum = memory_all2all / 2**20
-    mem_min = memsum; mem_max = memsum
-    call min_mpi(mem_min); call max_mpi(mem_max)
-    call sum_mpi(memsum)
-    if (ID == 0 .and. info > 0) then
-       write(*,'(x,a,t32,3(f9.1," MB"))') &
-            'Allocated memory for all2allv:', memsum, mem_min, mem_max
-    end if
+    call write_memory("All2allv memory", memory_all2all)
 
   END SUBROUTINE initialize_alltoallv
 
diff --git a/src/memory_and_time.f90 b/src/memory_and_time.f90
index b08d6c2..6907c35 100644
--- a/src/memory_and_time.f90
+++ b/src/memory_and_time.f90
@@ -34,26 +34,57 @@ END SUBROUTINE check_stat
   !------------------------------------------------------------------------------
 
 
-  SUBROUTINE write_memory(text, memory_in)
-
-    character(len=*), intent(in) :: text
-    real(dp), intent(in), optional :: memory_in
-    real(dp) :: memory_sum
-
-    if (text=='Total') then
-       if (ID==0) then
-          write(*, '(x,a,t26,f9.1," MB")') text, memory_total
-          write(*, '(x,a,t26,f9.1," GB")') ' ',  memory_total/1024.
-       endif
+  function mem_to_string(mem) result(memstring)
+    ! Return a string representation of memory value
+    real(dp) :: mem  ! memory in MB
+    character(len=12) :: memstring
+    character(len=2) :: unit
+    
+    if (mem > 2 ** 30) then
+       mem = mem / 2 ** 30
+       unit = 'PB'
+    else if (mem > 2 ** 20) then
+       mem = mem / 2 ** 20
+       unit = 'TB'
+    else if (mem > 2 ** 10) then
+       mem = mem / 2 ** 10
+       unit = 'GB'
+    else if (mem > 1) then
+       unit = 'MB'
+    else if (mem > 1 / 2 ** 10) then
+       mem = mem * 2 ** 10
+       unit = 'kB'
     else
-       memory_sum = memory_in/1024./1024.
-       call sum_mpi(memory_sum)
+       mem = mem ** 2 ** 20
+       unit = 'B '
+    end if
+    write(memstring, '(f9.2,x,a2)') mem, unit
+  end function mem_to_string
 
-       memory_total = memory_total+memory_sum
 
-       if (ID==0.and.memory_sum.gt.0.05)  &
-            write(*, '(x,a,t26,f9.1)') text, memory_sum
-    endif
+  SUBROUTINE write_memory(text, memory_in)
+    character(len=*), intent(in) :: text  ! prefix the memory string
+    real(dp), intent(in), optional :: memory_in  ! memory in bytes
+    real(dp) :: memory, memory_sum, memory_min, memory_max
+
+    if (text == 'Total') then
+       memory = memory_total
+    else
+       memory = memory_in / 2 ** 20
+       memory_total = memory_total + memory
+    end if
+
+    memory_min = memory
+    memory_max = memory
+    memory_sum = memory
+    call min_mpi(memory_min)
+    call max_mpi(memory_max)
+    call sum_mpi(memory_sum)
+    if (ID == 0 .and. info > 0) then
+       write(*, '(x,a,t28,"min = ",a,"   max =",a,"   total =",a)') text, &
+            mem_to_string(memory_min), mem_to_string(memory_max), &
+            mem_to_string(memory_sum)
+    end if
 
   END SUBROUTINE write_memory
 
@@ -66,55 +97,58 @@ SUBROUTINE write_time(text, time)
     character(len=*), intent(in) :: text
     real(dp), intent(in), optional :: time
 
-    real(dp) :: time_min, time_max ! -RK
-
-    if (text=='- Other') then
-       time2 = time1-time2_sum
-
-       if (ID==0.and.time2.gt.0.1)    &
-            write(*,'(3x,a,t32,f7.1)') text, time2
-
-    elseif (text(1:1)=='-') then
-       time2 = time
-
-       time_min = time2; time_max = time2 ! -RK
-       call min_mpi(time_min); call max_mpi(time_max) ! -RK
-
+    real(dp) :: time_min, time_max
+
+    if (text(1:1) == '-') then
+       if (text == '- Other') then
+          ! Difference between top level timer and the sum of the sub timers
+          time2 = time1 - time2_sum
+       else
+          ! sub timer
+          time2 = time
+       end if
+       time_min = time2
+       time_max = time2
+       call min_mpi(time_min)
+       call max_mpi(time_max)
        call sum_mpi(time2)
-
-       time2 = time2/ntasks
-       time2_sum = time2_sum+time2
-
-       !if (ID==0.and.time2.gt.0.05) write(*,'(3x,a,t32,f8.1)') text, time2
-       if (ID==0.and.time2.gt.0.05) write(*,'(3x,a,t32,3f8.1)') & ! -RK
-            text, time2, time_min, time_max ! -RK
-
-    elseif (text=='Total') then
+       time2 = time2 / ntasks
+       time2_sum = time2_sum + time2
+
+       if (ID == 0 .and. time2 > 0.05) then
+          write (*,'(3x,a,t34,"mean =",f8.1,"  min =",f8.1,"  max =",f8.1)') &
+               text, time2, time_min, time_max
+       end if
+    else if (text == 'Total') then
+       ! zero level timer
        time1 = time
 
        call sum_mpi(time1)
-       time1 = time1/ntasks
-
-       if (ID==0) write(*,'(x,a,t28,f8.1," s  (",f8.2," CPU hours)")')  &
-            text, time1, time1*ntasks*nthreads_max/3600.
+       time1 = time1 / ntasks
 
+       if (ID == 0) then
+          write(*,'(x,a,t28,f8.1," s  (",f12.2," CPU hours)")') &
+               text, time1, time1 * ntasks * nthreads_max / 3600.
+       end if
     else
+       ! New top level timer, possibly followed by sub timers
        time1 = time
+       time_min = time1
+       time_max = time1
+       call min_mpi(time_min)
+       call max_mpi(time_max)
 
-       time_min = time1; time_max = time1 ! -RK
-       call min_mpi(time_min); call max_mpi(time_max) ! -RK
-
-       time_cum = time_cum+time1
+       time_cum = time_cum + time1
 
        call sum_mpi(time1)
-       time1 = time1/ntasks
-       time2_sum = 0.0
-
-       !if (ID==0.and.time1.gt.0.05) write (*,'(x,a,t28,f8.1)') text, time1 -RK
-       if (ID==0.and.time1.gt.0.05) write (*,'(x,a,t28,3f8.1)') &
-            text, time1, time_min, time_max
-
-    endif
+       time1 = time1 / ntasks
+       time2_sum = 0
+
+       if (ID == 0 .and. time1 > 0.05) then
+          write (*,'(x,a,t34,"mean =",f8.1,"  min =",f8.1,"  max =",f8.1)') &
+               text, time1, time_min, time_max
+       end if
+    end if
 
   END SUBROUTINE write_time
 
diff --git a/src/noise_routines.f90 b/src/noise_routines.f90
index 8d0228f..6769f2a 100644
--- a/src/noise_routines.f90
+++ b/src/noise_routines.f90
@@ -7,6 +7,7 @@ MODULE noise_routines
   use fourier
   use mpi_wrappers
   use timing
+  use memory_and_time, only : write_memory
 
   use tod_storage, only : sampletime
 
@@ -392,16 +393,9 @@ SUBROUTINE build_filter()
          psddet(npsdtot), psdind(npsdtot), psdstart(npsdtot), psdstop(npsdtot), &
          stat=ierr)
     if (ierr /= 0) call abort_mpi('No room for fcov')
-    memory_filter = memory_filter + (nof / 2 + 1)*npsdtot*16. + npsdtot*24.
-
-    memsum = memory_filter / 2 ** 20
-    mem_min = memsum; mem_max = memsum
-    call min_mpi(mem_min); call max_mpi(mem_max)
-    call sum_mpi(memsum)
-    if (ID == 0 .and. info > 0) then
-       write(*,'(1x,a,t32,3(f9.1," MB"))') &
-            'Allocated memory for filter:', memsum, mem_min, mem_max
-    end if
+    memory_filter = memory_filter + (nof / 2 + 1) * npsdtot * 16 + npsdtot * 24
+
+    call write_memory("Filter memory", memory_filter)
 
     kread_file = (len_trim(file_spectrum) > 0)
     if (kread_file) call read_spectrum
@@ -1142,7 +1136,7 @@ SUBROUTINE construct_preconditioner(nna)
           print *, 'Constructed ', sum(ntries), ' band preconditioners.'
           do try = 1, trymax
              if (ntries(try) > 0) print *, ntries(try), ' at width = ', &
-                  min(try*precond_width_min, precond_width_max)
+                  min(try * precond_width_min, precond_width_max)
           end do
           if (nfail > 0) print *, '            ', nfail, ' failed (using CG)'
           if (nempty > 0) print *, '    Skipped ', nempty, ' empty intervals.'
@@ -1150,14 +1144,7 @@ SUBROUTINE construct_preconditioner(nna)
 
     end if
 
-    memsum = memory_precond / 2 ** 20
-    mem_min = memsum; mem_max = memsum
-    call min_mpi(mem_min); call max_mpi(mem_max)
-    call sum_mpi(memsum)
-    if (ID == 0 .and. info > 0) then
-       write(*,'(1x,a,t32,3(f9.1," MB"))') &
-            'Allocated memory for precond:', memsum, mem_min, mem_max
-    end if
+    call write_memory("Precond memory", memory_precond)
 
     cputime_prec_construct = cputime_prec_construct + get_time(16)
 
diff --git a/src/parameter_control.f90 b/src/parameter_control.f90
index 49acbfe..4633140 100644
--- a/src/parameter_control.f90
+++ b/src/parameter_control.f90
@@ -8,6 +8,7 @@ MODULE parameter_control
   use pointing, only : subchunk
   use maps_and_baselines, only : memory_baselines, memory_maps, &
        memory_basis_functions
+  use memory_and_time, only : write_memory
 
   implicit none
   private
@@ -933,16 +934,7 @@ SUBROUTINE start_timeloop()
        end do
     end if ! if (kfirst)
 
-    memsum = memory_basis_functions / 2**20
-    mem_min = memsum
-    mem_max = memsum
-    call min_mpi(mem_min)
-    call max_mpi(mem_max)
-    call sum_mpi(memsum)
-    if (ID == 0 .and. info > 0) then
-       write(*,mstr3) 'Allocated memory for basis_functions:', &
-            memsum, mem_min, mem_max
-    end if
+    call write_memory("Basis function memory", memory_basis_functions)
 
     !end if
 
diff --git a/src/pointing.f90 b/src/pointing.f90
index acd2d37..4334c01 100644
--- a/src/pointing.f90
+++ b/src/pointing.f90
@@ -5,7 +5,7 @@ MODULE pointing
 
   use commonparam
   use mpi_wrappers
-  use memory_and_time, only : check_stat
+  use memory_and_time, only : check_stat, write_memory
 
   implicit none
   private
@@ -59,15 +59,7 @@ SUBROUTINE init_pointing
 
     dummy_pixel = 12 * nside_max ** 2
 
-    memory = memory_pointing / 2d0 ** 20
-
-    mem_min = memory; mem_max = memory
-    call min_mpi(mem_min)
-    call max_mpi(mem_max)
-    call sum_mpi(memory)
-
-    if (id == 0 .and. info > 0) write(*,'(a,t32,3(f12.1," MB"))')   &
-         ' Allocated memory for pointing:', memory, mem_min, mem_max
+    call write_memory("Pointing memory", memory_pointing)
 
   END SUBROUTINE init_pointing
 
diff --git a/src/smadam.F90 b/src/smadam.F90
index 038c807..201f980 100644
--- a/src/smadam.F90
+++ b/src/smadam.F90
@@ -457,6 +457,7 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, &
           if (id == 0) write(*,*)
           if (id == 0) write(*,*) 'MEMORY (MB):'
 
+          memory_total = 0
           call write_memory('Detector pointing', memory_pointing)
           call write_memory('TOD buffer', memory_tod)
           call write_memory('Maps', memory_maps)
@@ -824,17 +825,18 @@ subroutine destripe_with_cache(comm, ndet, nsamp, nnz, &
        if (id == 0) write(*,*)
        if (id == 0) write(*,*) 'MEMORY (MB):'
 
-       call write_memory('Detector pointing',  memory_pointing)
-       call write_memory('TOD buffer',         memory_tod)
-       call write_memory('Maps',               memory_maps)
-       call write_memory('Baselines',          memory_baselines)
-       call write_memory('Basis functions',    memory_basis_functions)
-       call write_memory('Noise filter',       memory_filter)
-       call write_memory('Preconditioner',     memory_precond)
-       call write_memory('Submap table',       memory_ksubmap)
-       call write_memory('Temporary maps',     memory_locmap)
-       call write_memory('All2All buffers',    memory_all2all)
-       call write_memory('CG work space',      memory_cg)
+       memory_total = 0
+       call write_memory('Detector pointing', memory_pointing)
+       call write_memory('TOD buffer', memory_tod)
+       call write_memory('Maps', memory_maps)
+       call write_memory('Baselines', memory_baselines)
+       call write_memory('Basis functions', memory_basis_functions)
+       call write_memory('Noise filter', memory_filter)
+       call write_memory('Preconditioner', memory_precond)
+       call write_memory('Submap table', memory_ksubmap)
+       call write_memory('Temporary maps', memory_locmap)
+       call write_memory('All2All buffers', memory_all2all)
+       call write_memory('CG work space', memory_cg)
        call write_memory('NCM', memory_ncm)
        call write_memory('Total')
     end if
diff --git a/src/tod_storage.f90 b/src/tod_storage.f90
index eae6107..a2acf95 100644
--- a/src/tod_storage.f90
+++ b/src/tod_storage.f90
@@ -6,7 +6,7 @@ MODULE tod_storage
 
   use commonparam
   use mpi_wrappers
-  use memory_and_time, only : check_stat
+  use memory_and_time, only : check_stat, write_memory
 
   implicit none
   private
@@ -43,14 +43,7 @@ SUBROUTINE allocate_tod
     memory_tod = memory_tod + nosamples_proc
     surveyflags = .true.
 
-    memsum = memory_tod/1024./1024.
-
-    mem_min = memsum; mem_max = memsum
-    call min_mpi(mem_min); call max_mpi(mem_max)
-    call sum_mpi(memsum)
-
-    if (id == 0 .and. info >= 1) &
-         write(*,mstr3) 'Allocated memory for TOD:', memsum, mem_min, mem_max
+    call write_memory("TOD memory", memory_tod)
 
   END SUBROUTINE allocate_tod
 

From 247db627178e574f4a08f905f9f893c251c40d63 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Sat, 31 Aug 2019 09:55:55 -0700
Subject: [PATCH 13/17] avoid overflow

---
 src/maptod_transfer.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/maptod_transfer.f90 b/src/maptod_transfer.f90
index 5234e36..9fafe56 100644
--- a/src/maptod_transfer.f90
+++ b/src/maptod_transfer.f90
@@ -110,7 +110,7 @@ SUBROUTINE update_maptod_transfer(ksubmap)
 
        if (ierr /= 0) call abort_mpi('Failed to allocate locmap')
 
-       memory_locmap = (nsize_locmap + 1) * (nmap * 8 + nmap ** 2 * 8 + 4)
+       memory_locmap = (nsize_locmap + 1.) * (nmap * 8. + nmap ** 2 * 8. + 4)
 
     end if
 

From c785d1a70fee891da72f148e7c8074c26d138b0b Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Sat, 31 Aug 2019 09:56:33 -0700
Subject: [PATCH 14/17] update dependencies

---
 src/Makefile.am | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/Makefile.am b/src/Makefile.am
index a320394..ae05abb 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -72,7 +72,8 @@ map_routines.lo : map_routines.f90 commonparam.lo matrix.lo mpi_wrappers.lo \
 submap_transfer.lo : submap_transfer.f90 commonparam.lo mpi_wrappers.lo
 
 parameter_control.lo : parameter_control.f90 commonparam.lo \
-	noise_routines.lo pointing.lo mpi_wrappers.lo maps_and_baselines.lo
+	noise_routines.lo pointing.lo mpi_wrappers.lo maps_and_baselines.lo \
+	memory_and_time.lo
 
 pointing.lo : pointing.f90 commonparam.lo mpi_wrappers.lo memory_and_time.lo
 
@@ -84,14 +85,16 @@ maps_and_baselines.lo : maps_and_baselines.f90 commonparam.lo mpi_wrappers.lo \
 tod_storage.lo : tod_storage.f90 commonparam.lo mpi_wrappers.lo memory_and_time.lo
 
 noise_routines.lo : noise_routines.f90 commonparam.lo fourier_fftw_2003.lo \
-	mpi_wrappers.lo timing.lo tod_storage.lo
+	mpi_wrappers.lo timing.lo tod_storage.lo memory_and_time.lo
 
 memory_and_time.lo : memory_and_time.f90 commonparam.lo mpi_wrappers.lo
 
 output.lo : output.f90 commonparam.lo submap_transfer.lo \
-	mpi_wrappers.lo timing.lo maps_and_baselines.lo tod_storage.lo fitsmod2.lo
+	mpi_wrappers.lo timing.lo maps_and_baselines.lo tod_storage.lo \
+	fitsmod2.lo
 
-maptod_transfer.lo : maptod_transfer.f90 commonparam.lo mpi_wrappers.lo
+maptod_transfer.lo : maptod_transfer.f90 commonparam.lo mpi_wrappers.lo \
+	memory_and_time.lo
 
 smadam_routines.lo : smadam_routines.f90 commonparam.lo mpi_wrappers.lo \
 	maptod_transfer.lo noise_routines.lo map_routines.lo \
@@ -115,7 +118,6 @@ smadam.lo : smadam.F90 commonparam.lo inputparam.lo \
 	covmat.lo covmat_util.lo
 
 
-
 libmadam_la_LIBADD = $(AM_LIBS)
 
 #libmadam_la_LDFLAGS = -Wl,--no-undefined

From 09e50cc5851f6dae7f5efe94acf269ff9bd1591f Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Tue, 3 Sep 2019 12:52:35 -0700
Subject: [PATCH 15/17] make the aligned FFT default again and use FFTW_MEASURE
 as the default planning strategy

---
 src/commonparam.f90       |  2 +-
 src/fourier_fftw_2003.f90 | 17 +++++++----------
 src/parameter_control.f90 | 11 +++++++----
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/commonparam.f90 b/src/commonparam.f90
index 991e1f4..8bdfd30 100644
--- a/src/commonparam.f90
+++ b/src/commonparam.f90
@@ -130,7 +130,7 @@ MODULE commonparam
   real(dp) :: dnshort=-1
   integer :: nlong=-1, nshort=-1
   logical :: kfirst=.true., kfilter=.false.
-  logical :: unaligned_fft=.true.
+  logical :: unaligned_fft=.false.
 
   real(dp) :: cglimit=1.d-12
   integer :: iter_min=3, iter_max=1000
diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90
index 6f9e363..714bc9c 100644
--- a/src/fourier_fftw_2003.f90
+++ b/src/fourier_fftw_2003.f90
@@ -37,15 +37,13 @@ SUBROUTINE init_fourier(nof_in, unaligned)
     nof = nof_in
     nofinv = 1 / dble(nof)
 
-    !allocate(in(nof), out(nof/2+1), stat=ierr)
-    !if (ierr /= 0) then
-    !   stop 'Unable to allocate working space for Fourier transform'
-    !end if
     pin = fftw_alloc_real(int(nof, C_SIZE_T))
     pout = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T))
     call c_f_pointer(pin, in, [nof])
     call c_f_pointer(pout, out, [nof/2 + 1])
 
+    ! It may put a strain on the filesystem to load system wisdom
+    ! from every process.
 !!$    ierr = fftw_import_system_wisdom()
 !!$
 !!$    if (info /= 0) then
@@ -56,11 +54,11 @@ SUBROUTINE init_fourier(nof_in, unaligned)
 !!$       end if
 !!$    end if
 
-    !if (nof <= 65536) then
-    !fftw_planning_strategy = FFTW_MEASURE
-    !else
-    fftw_planning_strategy = FFTW_ESTIMATE
-    !end if
+    if (nof <= 1000000) then
+       fftw_planning_strategy = FFTW_MEASURE
+    else
+       fftw_planning_strategy = FFTW_ESTIMATE
+    end if
 
     fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT)
     !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_PRESERVE_INPUT)
@@ -71,7 +69,6 @@ SUBROUTINE init_fourier(nof_in, unaligned)
     plan = fftw_plan_dft_r2c_1d(nof, in, out, fftw_planning_strategy)
     plan_inv = fftw_plan_dft_c2r_1d(nof, out, in, fftw_planning_strategy)
 
-    !deallocate(in, out)
     call fftw_free(pin)
     call fftw_free(pout)
 
diff --git a/src/parameter_control.f90 b/src/parameter_control.f90
index 4633140..ffda8e7 100644
--- a/src/parameter_control.f90
+++ b/src/parameter_control.f90
@@ -590,10 +590,13 @@ SUBROUTINE write_parameters()
     write (*,fi) 'nside_cross', nside_cross, 'Healpix resolution (destriping)'
     if (info > 1) &
          write (*,fi) 'nside_submap', nside_submap, 'Submap resolution'
-    write (*,fk) 'concatenate_messages', concatenate_messages, &
-         'use mpi_alltoallv to communicate'
-    write (*,fk) 'allreduce', allreduce, &
-         'use allreduce to communicate'
+    if (allreduce) then
+       write (*,fk) 'allreduce', allreduce, &
+            'use allreduce to communicate'
+    else
+       write (*,fk) 'concatenate_messages', concatenate_messages, &
+            'use mpi_alltoallv to communicate'
+    end if
     write (*,fk) 'reassign_submaps', reassign_submaps, &
          'minimize communication by reassigning submaps'
 

From 9fdbbae7b1ffb386b4ddddd5e59b6c906cc843ed Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Tue, 3 Sep 2019 12:53:07 -0700
Subject: [PATCH 16/17] always use FFTW_MEASURE as the planning strategy

---
 src/fourier_fftw_2003.f90 | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90
index 714bc9c..011e15f 100644
--- a/src/fourier_fftw_2003.f90
+++ b/src/fourier_fftw_2003.f90
@@ -54,11 +54,11 @@ SUBROUTINE init_fourier(nof_in, unaligned)
 !!$       end if
 !!$    end if
 
-    if (nof <= 1000000) then
-       fftw_planning_strategy = FFTW_MEASURE
-    else
-       fftw_planning_strategy = FFTW_ESTIMATE
-    end if
+    !if (nof <= 1000000) then
+    fftw_planning_strategy = FFTW_MEASURE
+    !else
+    !   fftw_planning_strategy = FFTW_ESTIMATE
+    !end if
 
     fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT)
     !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_PRESERVE_INPUT)

From 26b6500d3714749a1bf47de06b1ae49cf57b8377 Mon Sep 17 00:00:00 2001
From: Reijo Keskitalo <reijo.keskitalo@gmail.com>
Date: Tue, 24 Sep 2019 14:48:30 -0700
Subject: [PATCH 17/17] Move parameter defaults into a separate function

---
 src/commonparam.f90 | 233 +++++++++++++++++++++++++++++++-------------
 src/smadam.F90      |   2 +
 2 files changed, 168 insertions(+), 67 deletions(-)

diff --git a/src/commonparam.f90 b/src/commonparam.f90
index 8bdfd30..e3b433b 100644
--- a/src/commonparam.f90
+++ b/src/commonparam.f90
@@ -34,41 +34,38 @@ MODULE commonparam
      real(dp), allocatable :: plateaus(:)
   END TYPE detector_data
 
-  real(dp) :: fsample = 1 ! Sampling frequency
-  character(len=30) :: unit_tod = 'unknown'
+  real(dp) :: fsample ! Sampling frequency
+  character(len=30) :: unit_tod
 
   ! OpenMP
-  integer :: nthreads_max=1, nthreads=1, id_thread=0
+  integer :: nthreads_max, nthreads, id_thread
   integer, external :: omp_get_num_procs, omp_get_max_threads, &
        omp_get_thread_num, omp_get_num_threads
 
-  logical :: flag_by_horn=.false., force_pol=.false.
-  logical :: concatenate_messages = .true., allreduce = .false.
-  logical :: reassign_submaps = .true.
-  logical :: noise_weights_from_psd = .false. ! integrate noise weights internally
+  logical :: flag_by_horn, force_pol
+  logical :: concatenate_messages, allreduce, reassign_submaps
+  logical :: noise_weights_from_psd ! integrate noise weights internally
   ! Assume well-behaved noise spectrum without low pass filtering
-  logical :: radiometers = .true.
-  integer(i8b) :: psd_downsample=10
-  integer (i8b) :: psdlen=1e6
+  logical :: radiometers
+  integer(i8b) :: psd_downsample, psdlen
   ! Enable sub ring map making
-  integer(i2b) :: nsubchunk=0
-  integer(i2b) :: isubchunk=0
-  real(dp) :: fnoise_max=1000 ! When measuring noise variance, use this limit
+  integer(i2b) :: nsubchunk, isubchunk
+  real(dp) :: fnoise_max
   character(len=SLEN) :: file_profile = ''
   character(len=SLEN) :: file_intermediate_profile = ''
-  logical :: checknan=.false. ! Can cost time
-  real(dp) :: diagfilter=0
-  logical :: sync_output=.true., skip_existing=.false.
-  logical :: write_cut=.false.
-  logical :: tod_is_clean = .false.
-  logical :: binary_output=.false., concatenate_binary=.false.
+  logical :: checknan
+  real(dp) :: diagfilter
+  logical :: sync_output, skip_existing
+  logical :: write_cut
+  logical :: tod_is_clean
+  logical :: binary_output, concatenate_binary
   ! Used for concatenate_binary when storing multiple MC maps
-  integer :: record_number=1
+  integer :: record_number
   ! Number of independent groups of processes writing binary maps
-  integer(i4b) :: nwrite_binary = 10
+  integer(i4b) :: nwrite_binary
   integer(i4b), parameter :: basis_poly=1, basis_fourier=2, basis_cheby=3, &
        basis_legendre=4
-  integer(i4b) :: basis_func=basis_legendre, basis_order=0
+  integer(i4b) :: basis_func, basis_order
   type :: basis_function_type
      integer(i8b) :: nsamp
      logical :: copy
@@ -100,71 +97,68 @@ MODULE commonparam
   integer(i4b) :: nsurvey
   logical(byte), allocatable :: surveyflags(:)
 
-  logical :: bin_subsets = .false.
-  logical :: mcmode = .false., cached = .false.
+  logical :: bin_subsets
+  logical :: mcmode, cached
 
-  real(dp) :: good_baseline_fraction=0 ! default accepts all baselines
+  real(dp) :: good_baseline_fraction
   ! monte Carlo mode
-  integer(idp) :: mc_increment=1e7, mc_loops=1, mc_id=0, rng_base=0
-  logical :: incomplete_matrices = .false.
+  integer(idp) :: mc_increment, mc_loops, mc_id, rng_base
+  logical :: incomplete_matrices
 
-  integer :: ID = 0
-  integer :: ntasks = 1
+  integer :: ID
+  integer :: ntasks
 
   character(len=*), parameter :: version = '3.7'
   character(len=*), parameter :: idf = '(i4,": ",a,1x,i0,2x,i0)'
 
   ! Input parameters
-  integer :: info=2
+  integer :: info
 
-  integer :: nside_map=512, nside_cross=-1, nside_submap=16
+  integer :: nside_map, nside_cross, nside_submap
 
-  real(dp) :: time_unit=-5.d0
-  real(dp) :: mission_time=0.0
-  integer :: nofiles=-1
+  real(dp) :: time_unit
+  real(dp) :: mission_time
+  integer :: nofiles
 
-  integer :: pixmode_map=2, pixmode_cross=2
-  real(dp) :: pixlim_map=1e-6, pixlim_cross=1e-3
-  logical :: allow_decoupling = .false.
+  integer :: pixmode_map, pixmode_cross
+  real(dp) :: pixlim_map, pixlim_cross
+  logical :: allow_decoupling
 
-  real(dp) :: dnshort=-1
-  integer :: nlong=-1, nshort=-1
-  logical :: kfirst=.true., kfilter=.false.
-  logical :: unaligned_fft=.false.
+  real(dp) :: dnshort
+  integer :: nlong, nshort
+  logical :: kfirst, kfilter
+  logical :: unaligned_fft
 
-  real(dp) :: cglimit=1.d-12
-  integer :: iter_min=3, iter_max=1000
-  integer :: precond_width_min=10, precond_width_max=100
-  logical :: use_fprecond=.false., use_cgprecond=.false.
+  real(dp) :: cglimit
+  integer :: iter_min, iter_max
+  integer :: precond_width_min, precond_width_max
+  logical :: use_fprecond, use_cgprecond
 
-  integer :: mode_detweight=0
+  integer :: mode_detweight
 
-  logical :: rm_monopole=.false., temperature_only=.false.
+  logical :: rm_monopole, temperature_only
 
   ! Input files
-  character(len=SLEN) :: file_param='', &
-       file_inmask='', file_spectrum='', file_gap=''
+  character(len=SLEN) :: file_param, file_inmask, file_spectrum, file_gap
 
   ! Output files
-  character(len=SLEN) :: file_root='madam'
-  character(len=SLEN) :: file_map='', file_hit='', file_base=''
-  character(len=SLEN) :: file_matrix='', file_mask='', file_binmap=''
-  character(len=SLEN) :: file_wcov='', file_leakmatrix=''
-  character(len=SLEN) :: file_gap_out='', file_mc='',  path_output=''
+  character(len=SLEN) :: file_root, file_map, file_hit, file_base, &
+       file_matrix, file_mask, file_binmap, file_wcov, file_leakmatrix, &
+       file_gap_out, file_mc, path_output
 
-  logical :: write_tod=.false.
+  logical :: write_tod
 
   ! LFI specific keywords
-  character(len=80) :: instrument  = ''
+  character(len=80) :: instrument
 
   ! NCVM specific parameters
-  logical :: kwrite_covmat=.false.
-  character(len=SLEN) :: file_covmat = ''
+  logical :: kwrite_covmat
+  character(len=SLEN) :: file_covmat
 
   type(detector_data), allocatable, target :: detectors(:)
 
   ! Derived directly from input parameters
-  integer :: nmap=0, ncc=0, nside_max, nodetectors=-1
+  integer :: nmap, ncc, nside_max, nodetectors
 
   ! Pixels
   integer :: nopix_map, nopix_cross
@@ -186,7 +180,7 @@ MODULE commonparam
   real(i8b), allocatable :: baselines_short_time(:)
 
   ! Number of pointing periods and their duration as a number of samples
-  integer(i4b) :: ninterval = -1, ninterval_tot = -1
+  integer(i4b) :: ninterval, ninterval_tot
   integer(i8b), allocatable :: intervals(:)
   integer(i4b), allocatable :: interval_id(:)
 
@@ -196,14 +190,119 @@ MODULE commonparam
   integer(i4b), allocatable :: id_submap(:)
   integer(i4b) :: id_next, id_prev
 
-  logical :: do_map=.true., do_binmap=.false., do_hits=.false.
-  logical :: do_mask=.false., do_matrix=.false., do_wcov=.false.
-  logical :: do_base=.false., do_leakmatrix=.false.
-  logical :: do_wnmap=.false.
-  logical :: use_inmask
-
-  integer :: noiter = 0
+  logical :: do_map, do_binmap, do_hits, do_mask, do_matrix, do_wcov, do_base, &
+       do_leakmatrix, do_wnmap, use_inmask
+  integer :: noiter
 
   !--------------------------------------------------------------------------
 
+contains
+
+  subroutine set_parameter_defaults()
+    ! Set or restore commonparam values to defaults
+    fsample = 1
+    unit_tod = "unknown"
+    nthreads_max = 1
+    nthreads = 1
+    id_thread = 1
+    flag_by_horn = .false.
+    force_pol = .false.
+    concatenate_messages = .true.
+    allreduce = .false.
+    reassign_submaps = .true.
+    noise_weights_from_psd = .false.
+    radiometers = .true.
+    psd_downsample = 10
+    psdlen = 1e6
+    nsubchunk = 0
+    isubchunk = 0
+    fnoise_max = 1000  ! When measuring noise variance, use this limit
+    checknan = .false.  ! Can cost time
+    diagfilter = 0
+    sync_output = .true.
+    skip_existing = .false.
+    write_cut = .false.
+    tod_is_clean = .false.
+    binary_output = .false.
+    concatenate_binary = .false.
+    record_number = 1
+    nwrite_binary = 10
+    basis_func = basis_legendre
+    basis_order = 0
+    bin_subsets = .false.
+    mcmode = .false.
+    cached = .false.
+    good_baseline_fraction = 0  ! default accepts all baselines
+    mc_increment = 1e7
+    mc_loops = 1
+    mc_id = 0
+    rng_base = 0
+    incomplete_matrices = .false.
+    ID = 0
+    ntasks = 1
+    info = 2
+    nside_map = 512
+    nside_cross = -1
+    nside_submap = 16
+    time_unit = -5
+    mission_time = 0
+    nofiles = -1
+    pixmode_map = 2
+    pixmode_cross = 2
+    pixlim_map = 1e-6
+    pixlim_cross = 1e-3
+    allow_decoupling = .false.
+    dnshort = -1
+    nlong = -1
+    nshort = -1
+    kfirst = .true.
+    kfilter = .false.
+    unaligned_fft = .false.
+    cglimit = 1e-12
+    iter_min = 3
+    iter_max = 1000
+    precond_width_min = 10
+    precond_width_max = 100
+    use_fprecond = .false.
+    use_cgprecond = .false.
+    mode_detweight = 0
+    rm_monopole = .false.
+    temperature_only = .false.
+    file_param = ""
+    file_inmask = ""
+    file_spectrum = ""
+    file_gap = ""
+    file_root = "madam"
+    file_map = ""
+    file_hit = ""
+    file_base = ""
+    file_matrix = ""
+    file_mask = ""
+    file_binmap = ""
+    file_wcov = ""
+    file_leakmatrix = ""
+    file_gap_out = ""
+    file_mc  = ""
+    path_output = ""
+    write_tod = .false.
+    instrument  = ""
+    kwrite_covmat = .false.
+    file_covmat = ""
+    nmap = 0
+    ncc = 0
+    nodetectors = -1
+    ninterval = -1
+    ninterval_tot = -1
+    do_map = .true.
+    do_binmap = .false.
+    do_hits = .false.
+    do_mask = .false.
+    do_matrix = .false.
+    do_wcov = .false.
+    do_base = .false.
+    do_leakmatrix = .false.
+    do_wnmap = .false.
+    noiter = 0
+  end subroutine set_parameter_defaults
+
 END MODULE commonparam
diff --git a/src/smadam.F90 b/src/smadam.F90
index 201f980..38af411 100644
--- a/src/smadam.F90
+++ b/src/smadam.F90
@@ -104,6 +104,8 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, &
     integer(i2b) :: subchunkcounter
     integer(i8b) :: pixmin, pixmax
 
+    call set_parameter_defaults()
+
     ! set up MPI
 
     call init_mpi(comm, ntasks, id)