diff --git a/src/Makefile.am b/src/Makefile.am index a320394..ae05abb 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -72,7 +72,8 @@ map_routines.lo : map_routines.f90 commonparam.lo matrix.lo mpi_wrappers.lo \ submap_transfer.lo : submap_transfer.f90 commonparam.lo mpi_wrappers.lo parameter_control.lo : parameter_control.f90 commonparam.lo \ - noise_routines.lo pointing.lo mpi_wrappers.lo maps_and_baselines.lo + noise_routines.lo pointing.lo mpi_wrappers.lo maps_and_baselines.lo \ + memory_and_time.lo pointing.lo : pointing.f90 commonparam.lo mpi_wrappers.lo memory_and_time.lo @@ -84,14 +85,16 @@ maps_and_baselines.lo : maps_and_baselines.f90 commonparam.lo mpi_wrappers.lo \ tod_storage.lo : tod_storage.f90 commonparam.lo mpi_wrappers.lo memory_and_time.lo noise_routines.lo : noise_routines.f90 commonparam.lo fourier_fftw_2003.lo \ - mpi_wrappers.lo timing.lo tod_storage.lo + mpi_wrappers.lo timing.lo tod_storage.lo memory_and_time.lo memory_and_time.lo : memory_and_time.f90 commonparam.lo mpi_wrappers.lo output.lo : output.f90 commonparam.lo submap_transfer.lo \ - mpi_wrappers.lo timing.lo maps_and_baselines.lo tod_storage.lo fitsmod2.lo + mpi_wrappers.lo timing.lo maps_and_baselines.lo tod_storage.lo \ + fitsmod2.lo -maptod_transfer.lo : maptod_transfer.f90 commonparam.lo mpi_wrappers.lo +maptod_transfer.lo : maptod_transfer.f90 commonparam.lo mpi_wrappers.lo \ + memory_and_time.lo smadam_routines.lo : smadam_routines.f90 commonparam.lo mpi_wrappers.lo \ maptod_transfer.lo noise_routines.lo map_routines.lo \ @@ -115,7 +118,6 @@ smadam.lo : smadam.F90 commonparam.lo inputparam.lo \ covmat.lo covmat_util.lo - libmadam_la_LIBADD = $(AM_LIBS) #libmadam_la_LDFLAGS = -Wl,--no-undefined diff --git a/src/commonparam.f90 b/src/commonparam.f90 index ba4f403..e3b433b 100644 --- a/src/commonparam.f90 +++ b/src/commonparam.f90 @@ -34,41 +34,38 @@ MODULE commonparam real(dp), allocatable :: plateaus(:) END TYPE detector_data - real(dp) :: fsample = 1 ! Sampling frequency - character(len=30) :: unit_tod = 'unknown' + real(dp) :: fsample ! Sampling frequency + character(len=30) :: unit_tod ! OpenMP - integer :: nthreads_max=1, nthreads=1, id_thread=0 + integer :: nthreads_max, nthreads, id_thread integer, external :: omp_get_num_procs, omp_get_max_threads, & omp_get_thread_num, omp_get_num_threads - logical :: flag_by_horn=.false., force_pol=.false. - logical :: concatenate_messages = .true., allreduce = .false. - logical :: reassign_submaps = .true. - logical :: noise_weights_from_psd = .false. ! integrate noise weights internally + logical :: flag_by_horn, force_pol + logical :: concatenate_messages, allreduce, reassign_submaps + logical :: noise_weights_from_psd ! integrate noise weights internally ! Assume well-behaved noise spectrum without low pass filtering - logical :: radiometers = .true. - integer(i8b) :: psd_downsample=10 - integer (i8b) :: psdlen=1e6 + logical :: radiometers + integer(i8b) :: psd_downsample, psdlen ! Enable sub ring map making - integer(i2b) :: nsubchunk=0 - integer(i2b) :: isubchunk=0 - real(dp) :: fnoise_max=1000 ! When measuring noise variance, use this limit + integer(i2b) :: nsubchunk, isubchunk + real(dp) :: fnoise_max character(len=SLEN) :: file_profile = '' character(len=SLEN) :: file_intermediate_profile = '' - logical :: checknan=.false. ! Can cost time - real(dp) :: diagfilter=0 - logical :: sync_output=.true., skip_existing=.false. - logical :: write_cut=.false. - logical :: tod_is_clean = .false. - logical :: binary_output=.false., concatenate_binary=.false. + logical :: checknan + real(dp) :: diagfilter + logical :: sync_output, skip_existing + logical :: write_cut + logical :: tod_is_clean + logical :: binary_output, concatenate_binary ! Used for concatenate_binary when storing multiple MC maps - integer :: record_number=1 + integer :: record_number ! Number of independent groups of processes writing binary maps - integer(i4b) :: nwrite_binary = 10 + integer(i4b) :: nwrite_binary integer(i4b), parameter :: basis_poly=1, basis_fourier=2, basis_cheby=3, & basis_legendre=4 - integer(i4b) :: basis_func=basis_legendre, basis_order=0 + integer(i4b) :: basis_func, basis_order type :: basis_function_type integer(i8b) :: nsamp logical :: copy @@ -100,70 +97,68 @@ MODULE commonparam integer(i4b) :: nsurvey logical(byte), allocatable :: surveyflags(:) - logical :: bin_subsets = .false. - logical :: mcmode = .false., cached = .false. + logical :: bin_subsets + logical :: mcmode, cached - real(dp) :: good_baseline_fraction=0 ! default accepts all baselines + real(dp) :: good_baseline_fraction ! monte Carlo mode - integer(idp) :: mc_increment=1e7, mc_loops=1, mc_id=0, rng_base=0 - logical :: incomplete_matrices = .false. + integer(idp) :: mc_increment, mc_loops, mc_id, rng_base + logical :: incomplete_matrices - integer :: ID = 0 - integer :: ntasks = 1 + integer :: ID + integer :: ntasks character(len=*), parameter :: version = '3.7' character(len=*), parameter :: idf = '(i4,": ",a,1x,i0,2x,i0)' ! Input parameters - integer :: info=2 + integer :: info - integer :: nside_map=512, nside_cross=-1, nside_submap=16 + integer :: nside_map, nside_cross, nside_submap - real(dp) :: time_unit=-5.d0 - real(dp) :: mission_time=0.0 - integer :: nofiles=-1 + real(dp) :: time_unit + real(dp) :: mission_time + integer :: nofiles - integer :: pixmode_map=2, pixmode_cross=2 - real(dp) :: pixlim_map=1e-6, pixlim_cross=1e-3 - logical :: allow_decoupling = .false. + integer :: pixmode_map, pixmode_cross + real(dp) :: pixlim_map, pixlim_cross + logical :: allow_decoupling - real(dp) :: dnshort=-1 - integer :: nlong=-1, nshort=-1 - logical :: kfirst=.true., kfilter=.false. + real(dp) :: dnshort + integer :: nlong, nshort + logical :: kfirst, kfilter + logical :: unaligned_fft - real(dp) :: cglimit=1.d-12 - integer :: iter_min=3, iter_max=1000 - integer :: precond_width_min=10, precond_width_max=100 - logical :: use_fprecond=.false., use_cgprecond=.false. + real(dp) :: cglimit + integer :: iter_min, iter_max + integer :: precond_width_min, precond_width_max + logical :: use_fprecond, use_cgprecond - integer :: mode_detweight=0 + integer :: mode_detweight - logical :: rm_monopole=.false., temperature_only=.false. + logical :: rm_monopole, temperature_only ! Input files - character(len=SLEN) :: file_param='', & - file_inmask='', file_spectrum='', file_gap='' + character(len=SLEN) :: file_param, file_inmask, file_spectrum, file_gap ! Output files - character(len=SLEN) :: file_root='madam' - character(len=SLEN) :: file_map='', file_hit='', file_base='' - character(len=SLEN) :: file_matrix='', file_mask='', file_binmap='' - character(len=SLEN) :: file_wcov='', file_leakmatrix='' - character(len=SLEN) :: file_gap_out='', file_mc='', path_output='' + character(len=SLEN) :: file_root, file_map, file_hit, file_base, & + file_matrix, file_mask, file_binmap, file_wcov, file_leakmatrix, & + file_gap_out, file_mc, path_output - logical :: write_tod=.false. + logical :: write_tod ! LFI specific keywords - character(len=80) :: instrument = '' + character(len=80) :: instrument ! NCVM specific parameters - logical :: kwrite_covmat=.false. - character(len=SLEN) :: file_covmat = '' + logical :: kwrite_covmat + character(len=SLEN) :: file_covmat type(detector_data), allocatable, target :: detectors(:) ! Derived directly from input parameters - integer :: nmap=0, ncc=0, nside_max, nodetectors=-1 + integer :: nmap, ncc, nside_max, nodetectors ! Pixels integer :: nopix_map, nopix_cross @@ -185,7 +180,7 @@ MODULE commonparam real(i8b), allocatable :: baselines_short_time(:) ! Number of pointing periods and their duration as a number of samples - integer(i4b) :: ninterval = -1, ninterval_tot = -1 + integer(i4b) :: ninterval, ninterval_tot integer(i8b), allocatable :: intervals(:) integer(i4b), allocatable :: interval_id(:) @@ -195,14 +190,119 @@ MODULE commonparam integer(i4b), allocatable :: id_submap(:) integer(i4b) :: id_next, id_prev - logical :: do_map=.true., do_binmap=.false., do_hits=.false. - logical :: do_mask=.false., do_matrix=.false., do_wcov=.false. - logical :: do_base=.false., do_leakmatrix=.false. - logical :: do_wnmap=.false. - logical :: use_inmask - - integer :: noiter = 0 + logical :: do_map, do_binmap, do_hits, do_mask, do_matrix, do_wcov, do_base, & + do_leakmatrix, do_wnmap, use_inmask + integer :: noiter !-------------------------------------------------------------------------- +contains + + subroutine set_parameter_defaults() + ! Set or restore commonparam values to defaults + fsample = 1 + unit_tod = "unknown" + nthreads_max = 1 + nthreads = 1 + id_thread = 1 + flag_by_horn = .false. + force_pol = .false. + concatenate_messages = .true. + allreduce = .false. + reassign_submaps = .true. + noise_weights_from_psd = .false. + radiometers = .true. + psd_downsample = 10 + psdlen = 1e6 + nsubchunk = 0 + isubchunk = 0 + fnoise_max = 1000 ! When measuring noise variance, use this limit + checknan = .false. ! Can cost time + diagfilter = 0 + sync_output = .true. + skip_existing = .false. + write_cut = .false. + tod_is_clean = .false. + binary_output = .false. + concatenate_binary = .false. + record_number = 1 + nwrite_binary = 10 + basis_func = basis_legendre + basis_order = 0 + bin_subsets = .false. + mcmode = .false. + cached = .false. + good_baseline_fraction = 0 ! default accepts all baselines + mc_increment = 1e7 + mc_loops = 1 + mc_id = 0 + rng_base = 0 + incomplete_matrices = .false. + ID = 0 + ntasks = 1 + info = 2 + nside_map = 512 + nside_cross = -1 + nside_submap = 16 + time_unit = -5 + mission_time = 0 + nofiles = -1 + pixmode_map = 2 + pixmode_cross = 2 + pixlim_map = 1e-6 + pixlim_cross = 1e-3 + allow_decoupling = .false. + dnshort = -1 + nlong = -1 + nshort = -1 + kfirst = .true. + kfilter = .false. + unaligned_fft = .false. + cglimit = 1e-12 + iter_min = 3 + iter_max = 1000 + precond_width_min = 10 + precond_width_max = 100 + use_fprecond = .false. + use_cgprecond = .false. + mode_detweight = 0 + rm_monopole = .false. + temperature_only = .false. + file_param = "" + file_inmask = "" + file_spectrum = "" + file_gap = "" + file_root = "madam" + file_map = "" + file_hit = "" + file_base = "" + file_matrix = "" + file_mask = "" + file_binmap = "" + file_wcov = "" + file_leakmatrix = "" + file_gap_out = "" + file_mc = "" + path_output = "" + write_tod = .false. + instrument = "" + kwrite_covmat = .false. + file_covmat = "" + nmap = 0 + ncc = 0 + nodetectors = -1 + ninterval = -1 + ninterval_tot = -1 + do_map = .true. + do_binmap = .false. + do_hits = .false. + do_mask = .false. + do_matrix = .false. + do_wcov = .false. + do_base = .false. + do_leakmatrix = .false. + do_wnmap = .false. + noiter = 0 + end subroutine set_parameter_defaults + END MODULE commonparam diff --git a/src/fourier_fftw_2003.f90 b/src/fourier_fftw_2003.f90 index d8cb432..011e15f 100644 --- a/src/fourier_fftw_2003.f90 +++ b/src/fourier_fftw_2003.f90 @@ -21,9 +21,10 @@ MODULE fourier CONTAINS - SUBROUTINE init_fourier(nof_in) + SUBROUTINE init_fourier(nof_in, unaligned) integer, intent(in) :: nof_in + logical, intent(in) :: unaligned integer :: fftw_planning_strategy real(C_DOUBLE), pointer :: in(:) complex(C_DOUBLE_COMPLEX), pointer :: out(:) @@ -36,26 +37,24 @@ SUBROUTINE init_fourier(nof_in) nof = nof_in nofinv = 1 / dble(nof) - !allocate(in(nof), out(nof/2+1), stat=ierr) - !if (ierr /= 0) then - ! stop 'Unable to allocate working space for Fourier transform' - !end if pin = fftw_alloc_real(int(nof, C_SIZE_T)) pout = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) call c_f_pointer(pin, in, [nof]) call c_f_pointer(pout, out, [nof/2 + 1]) - ierr = fftw_import_system_wisdom() - - if (info /= 0) then - if (ierr /= 0) then - write (*,*) 'FFTW: System wide wisdom loaded' - else - write (*,*) 'FFTW: Unable to load system wisdom' - end if - end if - - !if (nof <= 65536) then + ! It may put a strain on the filesystem to load system wisdom + ! from every process. +!!$ ierr = fftw_import_system_wisdom() +!!$ +!!$ if (info /= 0) then +!!$ if (ierr /= 0) then +!!$ write (*,*) 'FFTW: System wide wisdom loaded' +!!$ else +!!$ write (*,*) 'FFTW: Unable to load system wisdom' +!!$ end if +!!$ end if + + !if (nof <= 1000000) then fftw_planning_strategy = FFTW_MEASURE !else ! fftw_planning_strategy = FFTW_ESTIMATE @@ -63,12 +62,13 @@ SUBROUTINE init_fourier(nof_in) fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_DESTROY_INPUT) !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_PRESERVE_INPUT) - !fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_UNALIGNED) + if (unaligned) then + fftw_planning_strategy = ior(fftw_planning_strategy, FFTW_UNALIGNED) + end if plan = fftw_plan_dft_r2c_1d(nof, in, out, fftw_planning_strategy) plan_inv = fftw_plan_dft_c2r_1d(nof, out, in, fftw_planning_strategy) - !deallocate(in, out) call fftw_free(pin) call fftw_free(pout) diff --git a/src/inputparam.f90 b/src/inputparam.f90 index 9e79008..90c0cbd 100644 --- a/src/inputparam.f90 +++ b/src/inputparam.f90 @@ -294,6 +294,8 @@ SUBROUTINE read_line(line) read(value, *, iostat=ierr) kfilter case ('diagfilter') read(value, *, iostat=ierr) diagfilter + case ('unaligned_fft') + read(value, *, iostat=ierr) unaligned_fft case ('precond_width_min') read(value, *, iostat=ierr) precond_width_min case ('precond_width_max') diff --git a/src/maps_and_baselines.f90 b/src/maps_and_baselines.f90 index 79e7019..1e68104 100644 --- a/src/maps_and_baselines.f90 +++ b/src/maps_and_baselines.f90 @@ -8,7 +8,7 @@ MODULE maps_and_baselines noba_short, nodetectors, do_binmap, do_hits, do_mask, & ndetset, nmap, nopix_cross, nopix_map, nsurvey, use_inmask use mpi_wrappers, only : min_mpi, max_mpi, sum_mpi - use memory_and_time, only : check_stat + use memory_and_time, only : check_stat, write_memory implicit none private @@ -102,16 +102,7 @@ SUBROUTINE allocate_maps allocate(cca(nmap, nmap, 0:0), wamap(nmap, 0:0)) endif - memsum = memory_maps / 2. ** 20. - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - - if (ID==0 .and. info > 0) then - write(*,mstr3) 'Allocated memory for maps:', memsum, mem_min, mem_max - end if + call write_memory("Map memory", memory_maps) END SUBROUTINE allocate_maps @@ -171,15 +162,7 @@ SUBROUTINE allocate_baselines allocate(yba(1, 1, 1), nna(1, 1, 1, 1)) endif - memsum = memory_baselines / 2**20 - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - - if (ID==0 .and. info > 0) write(*,mstr3) & - 'Allocated memory for baselines:', memsum, mem_min, mem_max + call write_memory("Baseline memory", memory_baselines) END SUBROUTINE allocate_baselines diff --git a/src/maptod_transfer.f90 b/src/maptod_transfer.f90 index 6085ff4..9fafe56 100644 --- a/src/maptod_transfer.f90 +++ b/src/maptod_transfer.f90 @@ -2,6 +2,7 @@ MODULE maptod_transfer use commonparam use mpi_wrappers + use memory_and_time, only : write_memory implicit none private @@ -54,31 +55,42 @@ SUBROUTINE update_maptod_transfer(ksubmap) logical, allocatable, intent(in) :: ksubmap(:) integer(i4b) :: ierr real(sp) :: memsum, mem_min, mem_max + integer :: nolocmaps_min, nolocmaps_max + real(dp) :: nolocmaps_sum nolocmaps = count(ksubmap(0:nosubmaps_tot-1)) nolocpix = nolocmaps * nosubpix_max + nolocmaps_min = nolocmaps + nolocmaps_max = nolocmaps + nolocmaps_sum = nolocmaps + call min_mpi(nolocmaps_min) + call max_mpi(nolocmaps_max) + call sum_mpi(nolocmaps_sum) + if (ID == 0 .and. info > 0) then + write(*, '(x,"Total submaps = ",i0," submap size = ",i0)') & + nosubmaps_tot, nosubpix_max + write(*, '(x,"Local submaps: min = ",i0," max = ",i0," mean = ",f8.2)') & + nolocmaps_min, nolocmaps_max, nolocmaps_sum / ntasks + end if + if (.not. allocated(ksubmap_table)) then ! Default Fortran logical is 32 bits (!) if (allreduce) then - allocate(ksubmap_table(0:nosubmaps_tot-1, 0:0), stat=ierr) + allocate(ksubmap_table(0:nosubmaps_tot - 1, 0:0), stat=ierr) memory_ksubmap = nosubmaps_tot*4 else - allocate(ksubmap_table(0:nosubmaps_tot-1, 0:ntasks-1), stat=ierr) - memory_ksubmap = nosubmaps_tot*ntasks*4 + allocate(ksubmap_table(0:nosubmaps_tot - 1, 0:ntasks-1), stat=ierr) + memory_ksubmap = nosubmaps_tot * ntasks * 4 end if if (ierr /= 0) call abort_mpi('No room for ksubmap_table') ksubmap_table = .false. - if (ID==0 .and. info > 0) then - memsum = memory_ksubmap / 2**20 - write(*,mstr3) 'Allocated memory for submap table:', & - memsum, memsum, memsum - end if + call write_memory('Submap table memory', memory_ksubmap) end if if (allreduce) then ! All processes share ksubmap - ksubmap_table(:, 0) = ksubmap(0:nosubmaps_tot-1) + ksubmap_table(:, 0) = ksubmap(0:nosubmaps_tot - 1) else call mpi_allgather(ksubmap, nosubmaps_tot, MPI_LOGICAL, & ksubmap_table, nosubmaps_tot, MPI_LOGICAL, comm, ierr) @@ -98,7 +110,7 @@ SUBROUTINE update_maptod_transfer(ksubmap) if (ierr /= 0) call abort_mpi('Failed to allocate locmap') - memory_locmap = (nsize_locmap+1) * (nmap*8.+nmap**2*8.+8) + memory_locmap = (nsize_locmap + 1.) * (nmap * 8. + nmap ** 2 * 8. + 4) end if @@ -107,17 +119,7 @@ SUBROUTINE update_maptod_transfer(ksubmap) locmask = 0 lochits = 0 - memsum = memory_locmap / 2**20 - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - - if (ID==0 .and. info > 0) then - write(*,mstr3) 'Allocated memory for local maps:', & - memsum, mem_min, mem_max - end if + call write_memory('local maps memory', memory_locmap) END SUBROUTINE update_maptod_transfer @@ -165,14 +167,7 @@ SUBROUTINE initialize_alltoallv() + recvcounts_gather(itask-1) end do - memsum = memory_all2all / 2**20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for allreduce:', memsum, mem_min, mem_max - end if + call write_memory("Allreduce memory", memory_all2all) end if ! allreduce implies concatenate_messages = .false. @@ -293,14 +288,7 @@ SUBROUTINE initialize_alltoallv() offset = offset + nrecv end do - memsum = memory_all2all / 2**20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for all2allv:', memsum, mem_min, mem_max - end if + call write_memory("All2allv memory", memory_all2all) END SUBROUTINE initialize_alltoallv diff --git a/src/memory_and_time.f90 b/src/memory_and_time.f90 index b08d6c2..6907c35 100644 --- a/src/memory_and_time.f90 +++ b/src/memory_and_time.f90 @@ -34,26 +34,57 @@ END SUBROUTINE check_stat !------------------------------------------------------------------------------ - SUBROUTINE write_memory(text, memory_in) - - character(len=*), intent(in) :: text - real(dp), intent(in), optional :: memory_in - real(dp) :: memory_sum - - if (text=='Total') then - if (ID==0) then - write(*, '(x,a,t26,f9.1," MB")') text, memory_total - write(*, '(x,a,t26,f9.1," GB")') ' ', memory_total/1024. - endif + function mem_to_string(mem) result(memstring) + ! Return a string representation of memory value + real(dp) :: mem ! memory in MB + character(len=12) :: memstring + character(len=2) :: unit + + if (mem > 2 ** 30) then + mem = mem / 2 ** 30 + unit = 'PB' + else if (mem > 2 ** 20) then + mem = mem / 2 ** 20 + unit = 'TB' + else if (mem > 2 ** 10) then + mem = mem / 2 ** 10 + unit = 'GB' + else if (mem > 1) then + unit = 'MB' + else if (mem > 1 / 2 ** 10) then + mem = mem * 2 ** 10 + unit = 'kB' else - memory_sum = memory_in/1024./1024. - call sum_mpi(memory_sum) + mem = mem ** 2 ** 20 + unit = 'B ' + end if + write(memstring, '(f9.2,x,a2)') mem, unit + end function mem_to_string - memory_total = memory_total+memory_sum - if (ID==0.and.memory_sum.gt.0.05) & - write(*, '(x,a,t26,f9.1)') text, memory_sum - endif + SUBROUTINE write_memory(text, memory_in) + character(len=*), intent(in) :: text ! prefix the memory string + real(dp), intent(in), optional :: memory_in ! memory in bytes + real(dp) :: memory, memory_sum, memory_min, memory_max + + if (text == 'Total') then + memory = memory_total + else + memory = memory_in / 2 ** 20 + memory_total = memory_total + memory + end if + + memory_min = memory + memory_max = memory + memory_sum = memory + call min_mpi(memory_min) + call max_mpi(memory_max) + call sum_mpi(memory_sum) + if (ID == 0 .and. info > 0) then + write(*, '(x,a,t28,"min = ",a," max =",a," total =",a)') text, & + mem_to_string(memory_min), mem_to_string(memory_max), & + mem_to_string(memory_sum) + end if END SUBROUTINE write_memory @@ -66,55 +97,58 @@ SUBROUTINE write_time(text, time) character(len=*), intent(in) :: text real(dp), intent(in), optional :: time - real(dp) :: time_min, time_max ! -RK - - if (text=='- Other') then - time2 = time1-time2_sum - - if (ID==0.and.time2.gt.0.1) & - write(*,'(3x,a,t32,f7.1)') text, time2 - - elseif (text(1:1)=='-') then - time2 = time - - time_min = time2; time_max = time2 ! -RK - call min_mpi(time_min); call max_mpi(time_max) ! -RK - + real(dp) :: time_min, time_max + + if (text(1:1) == '-') then + if (text == '- Other') then + ! Difference between top level timer and the sum of the sub timers + time2 = time1 - time2_sum + else + ! sub timer + time2 = time + end if + time_min = time2 + time_max = time2 + call min_mpi(time_min) + call max_mpi(time_max) call sum_mpi(time2) - - time2 = time2/ntasks - time2_sum = time2_sum+time2 - - !if (ID==0.and.time2.gt.0.05) write(*,'(3x,a,t32,f8.1)') text, time2 - if (ID==0.and.time2.gt.0.05) write(*,'(3x,a,t32,3f8.1)') & ! -RK - text, time2, time_min, time_max ! -RK - - elseif (text=='Total') then + time2 = time2 / ntasks + time2_sum = time2_sum + time2 + + if (ID == 0 .and. time2 > 0.05) then + write (*,'(3x,a,t34,"mean =",f8.1," min =",f8.1," max =",f8.1)') & + text, time2, time_min, time_max + end if + else if (text == 'Total') then + ! zero level timer time1 = time call sum_mpi(time1) - time1 = time1/ntasks - - if (ID==0) write(*,'(x,a,t28,f8.1," s (",f8.2," CPU hours)")') & - text, time1, time1*ntasks*nthreads_max/3600. + time1 = time1 / ntasks + if (ID == 0) then + write(*,'(x,a,t28,f8.1," s (",f12.2," CPU hours)")') & + text, time1, time1 * ntasks * nthreads_max / 3600. + end if else + ! New top level timer, possibly followed by sub timers time1 = time + time_min = time1 + time_max = time1 + call min_mpi(time_min) + call max_mpi(time_max) - time_min = time1; time_max = time1 ! -RK - call min_mpi(time_min); call max_mpi(time_max) ! -RK - - time_cum = time_cum+time1 + time_cum = time_cum + time1 call sum_mpi(time1) - time1 = time1/ntasks - time2_sum = 0.0 - - !if (ID==0.and.time1.gt.0.05) write (*,'(x,a,t28,f8.1)') text, time1 -RK - if (ID==0.and.time1.gt.0.05) write (*,'(x,a,t28,3f8.1)') & - text, time1, time_min, time_max - - endif + time1 = time1 / ntasks + time2_sum = 0 + + if (ID == 0 .and. time1 > 0.05) then + write (*,'(x,a,t34,"mean =",f8.1," min =",f8.1," max =",f8.1)') & + text, time1, time_min, time_max + end if + end if END SUBROUTINE write_time diff --git a/src/noise_routines.f90 b/src/noise_routines.f90 index 8e81426..6769f2a 100644 --- a/src/noise_routines.f90 +++ b/src/noise_routines.f90 @@ -7,6 +7,7 @@ MODULE noise_routines use fourier use mpi_wrappers use timing + use memory_and_time, only : write_memory use tod_storage, only : sampletime @@ -19,10 +20,11 @@ MODULE noise_routines real(dp), save, public :: memory_filter = 0, memory_precond = 0, & cputime_filter = 0, cputime_precond = 0, & - cputime_prec_construct = 0 + cputime_prec_construct = 0, & + cputime_filter_allocate = 0, & + cputime_filter_init = 0, & + cputime_filter_build = 0 - !real(dp), allocatable :: xx(:) - !complex(dp), allocatable :: fx(:) complex(dp), allocatable, target :: fcov(:, :), fprecond(:, :) real(dp), allocatable :: invcov(:, :) @@ -41,6 +43,11 @@ MODULE noise_routines type(bandprec_pointer), allocatable :: bandprec(:, :, :) real(dp), allocatable, target :: prec_diag(:, :) + ! Pointers for FFT workspace + type(C_PTR), allocatable :: pxx(:), pfx(:) + real(C_DOUBLE), allocatable, target :: xxthreads(:, :) + complex(C_DOUBLE_COMPLEX), allocatable, target :: fxthreads(:, :) + ! PSD information integer :: npsdtot @@ -162,18 +169,18 @@ subroutine interpolate_psd(freq, psd, targetfreq, targetpsd) end if flog = log(targetfreq(i)) if (j < n_in - 1) then - do while (logfreq(j+1) < flog) + do while (logfreq(j + 1) < flog) j = j + 1 if (j == n_in - 1) exit end do end if - if (psd(j) > 0 .and. psd(j+1) > 0) then + if (psd(j) > 0 .and. psd(j + 1) > 0) then ! Logarithmic interpolation - r = (flog - logfreq(j)) / (logfreq(j+1) - logfreq(j)) + r = (flog - logfreq(j)) / (logfreq(j + 1) - logfreq(j)) targetpsd(i) = exp(logpsd(j) * (1._dp - r) + logpsd(j + 1) * r) else ! Linear interpolation - r = (targetfreq(i) - freq(j)) / (freq(j+1) - freq(j)) + r = (targetfreq(i) - freq(j)) / (freq(j + 1) - freq(j)) targetpsd(i) = psd(j) * (1._dp - r) + psd(j + 1) * r end if end do @@ -220,7 +227,7 @@ function psd_index_det(idet, starttime) result(ipsd) if (detectors(idet)%psdstarts(ipsd) > starttime) ipsd = -1 return end if - if (detectors(idet)%psdstarts(ipsd+1) > starttime) exit + if (detectors(idet)%psdstarts(ipsd + 1) > starttime) exit ipsd = ipsd + 1 end do @@ -235,8 +242,12 @@ end function psd_index_det SUBROUTINE init_filter integer :: nof_min, ierr + real(dp) :: t1 if (.not. kfilter) return + if (nof >= 0) return ! Filter already initialized + + if (info == 3 .and. ID == 0) write(*,*) 'Initializing filter...' nofilter = 2 * noba_short_pp_max @@ -247,16 +258,36 @@ SUBROUTINE init_filter nof = 1 do if (nof >= nof_min) exit - nof = 2*nof + nof = 2 * nof end do if (ID == 0) write(*,*) 'FFT length =', nof - !allocate(fx(nof/2+1), xx(nof), stat=ierr) ! Work space - !if (ierr /= 0) call abort_mpi('No room for fx and xx') + memory_filter = ((nof / 2 + 1) * 16. + nof * 8) * nthreads + + call reset_time(16) + + call init_fourier(nof, unaligned_fft) + + cputime_filter_init = cputime_filter_init + get_time_and_reset(16) - memory_filter = ((nof/2+1)*16. + nof*8) * nthreads + ! Permanently allocate space for FFT for each OpenMP thread + + if (unaligned_fft) then + allocate(xxthreads(nof, 0:nthreads - 1), & + fxthreads(nof / 2 + 1, 0:nthreads - 1), stat=ierr) + if (ierr /= 0) stop 'Failed to allocate xxthreads and fxthreads' + else + allocate(pxx(0:nthreads - 1), pfx(0:nthreads - 1), stat=ierr) + if (ierr /= 0) stop 'Failed to allocate pxx and pfx' + pxx = C_NULL_PTR + pfx = C_NULL_PTR + do id_thread = 0, nthreads - 1 + pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) + pfx(id_thread) = fftw_alloc_complex(int(nof / 2 + 1, C_SIZE_T)) + end do + end if - call init_fourier(nof) + cputime_filter_allocate = cputime_filter_allocate + get_time(16) END SUBROUTINE init_filter @@ -295,6 +326,8 @@ end subroutine free_bandprec SUBROUTINE close_filter + real(dp) :: t1 + if (.not. kfilter) return if (allocated(fcov)) deallocate(fcov) @@ -311,6 +344,18 @@ SUBROUTINE close_filter call close_fourier + if (unaligned_fft) then + if (allocated(xxthreads)) deallocate(xxthreads, fxthreads) + else + if (allocated(pxx)) then + do id_thread = 0, nthreads - 1 + call fftw_free(pxx(id_thread)) + call fftw_free(pfx(id_thread)) + end do + deallocate(pxx, pfx) + end if + end if + nof = -1 END SUBROUTINE close_filter @@ -335,7 +380,7 @@ SUBROUTINE build_filter() if (info == 3 .and. ID == 0) write(*,*) 'Building filter...' if (info > 4) write(*,*) 'Building filter...' - if (nof < 0) call init_filter + call reset_time(16) npsdtot = 0 do idet = 1, nodetectors @@ -344,20 +389,13 @@ SUBROUTINE build_filter() ! Construct and store the filters if (allocated(fcov)) deallocate(fcov) - allocate(fcov(nof/2+1, npsdtot), & ! Fourier inverse of the noise filter + allocate(fcov(nof / 2 + 1, npsdtot), & ! Fourier inverse of the noise filter psddet(npsdtot), psdind(npsdtot), psdstart(npsdtot), psdstop(npsdtot), & stat=ierr) if (ierr /= 0) call abort_mpi('No room for fcov') - memory_filter = memory_filter + (nof/2+1)*npsdtot*16. + npsdtot*24. - - memsum = memory_filter / 2**20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(1x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for filter:', memsum, mem_min, mem_max - end if + memory_filter = memory_filter + (nof / 2 + 1) * npsdtot * 16 + npsdtot * 24 + + call write_memory("Filter memory", memory_filter) kread_file = (len_trim(file_spectrum) > 0) if (kread_file) call read_spectrum @@ -374,7 +412,7 @@ SUBROUTINE build_filter() id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - allocate(aspec(nof/2+1), spectrum(nof), f(nof), g(nof), stat=ierr) + allocate(aspec(nof / 2 + 1), spectrum(nof), f(nof), g(nof), stat=ierr) if (ierr /= 0) call abort_mpi('No room for aspec') ipsdtot = 0 @@ -393,7 +431,7 @@ SUBROUTINE build_filter() psdind(ipsdtot) = ipsd psdstart(ipsdtot) = detectors(idet)%psdstarts(ipsd) if (ipsd < detectors(idet)%npsd) then - psdstop(ipsdtot) = detectors(idet)%psdstarts(ipsd+1) + psdstop(ipsdtot) = detectors(idet)%psdstarts(ipsd + 1) else psdstop(ipsdtot) = 1e30 end if @@ -414,23 +452,23 @@ SUBROUTINE build_filter() end if do i = 1, nof - x = pi*f(i)/fa + x = pi * f(i) / fa if (x < 1.e-30) then - g(i) = 1.d0 + g(i) = 1 else - g(i) = sin(x)**2/x**2 + g(i) = sin(x) ** 2 / x ** 2 end if end do if (k == 0) then aspec(1) = spectrum(1) else - aspec(1) = aspec(1) + 2*spectrum(1)*g(1) + aspec(1) = aspec(1) + 2 * spectrum(1) * g(1) end if - do i = 2, nof/2+1 - aspec(i) = aspec(i) + spectrum(i)*g(i) & - + spectrum(nof-i+2)*g(nof-i+2) + do i = 2, nof / 2 + 1 + aspec(i) = aspec(i) + spectrum(i) * g(i) & + + spectrum(nof - i + 2) * g(nof - i + 2) end do end do @@ -458,6 +496,8 @@ SUBROUTINE build_filter() if (kread_file) deallocate(ftable, spectrum_table) + cputime_filter_build = cputime_filter_build + get_time(16) + if (info > 4) write(*,*) 'Done' CONTAINS @@ -488,7 +528,7 @@ SUBROUTINE read_spectrum() end if allocate(ftable(nolines), spectrum_table(nolines, nodetectors), & - dtemp(nocols+1, nolines), stemp(nocols), sigmas(nocols), & + dtemp(nocols + 1, nolines), stemp(nocols), sigmas(nocols), & stat=ierr) if (ierr /= 0) call abort_mpi('No room to read the noise spectra') @@ -518,7 +558,7 @@ SUBROUTINE read_spectrum() do idet = 1, nodetectors ! find the column of the read table that corresponds to ! detector # idet - do icol = 1, nocols+1 + do icol = 1, nocols + 1 if (icol > nocols) then call abort_mpi('ERROR: ' // trim(detectors(idet)%name) // & ' not in ' // trim(file_spectrum)) @@ -532,7 +572,7 @@ SUBROUTINE read_spectrum() ! prepare for logaritmic interpolation ! first column was the frequency - spectrum_table(:, idet) = log(dtemp(icol+1, :)) + spectrum_table(:, idet) = log(dtemp(icol + 1, :)) end do deallocate(stemp, dtemp) @@ -542,7 +582,7 @@ SUBROUTINE read_spectrum() call broadcast_mpi(nolines, 0) do idet = 1, nodetectors call broadcast_mpi(detectors(idet)%sigmas, detectors(idet)%npsd, 0) - detectors(idet)%weights = 1 / detectors(idet)%sigmas**2 + detectors(idet)%weights = 1 / detectors(idet)%sigmas ** 2 end do if (id /= 0) then @@ -578,16 +618,16 @@ SUBROUTINE get_spectrum_file() logf = log(f(i)) do - if (logf <= ftable(k+1) .or. k == nolines-1) exit + if (logf <= ftable(k + 1) .or. k == nolines-1) exit k = k + 1 end do - p = (logf-ftable(k)) / (ftable(k+1)-ftable(k)) + p = (logf - ftable(k)) / (ftable(k + 1) - ftable(k)) if (p < 0) p = 0 if (p > 1) p = 1 - logspec = (1.d0-p)*spectrum_table(k, icol) + p*spectrum_table(k+1, icol) + logspec = (1.d0 - p) * spectrum_table(k, icol) + p * spectrum_table(k + 1, icol) spectrum(i) = exp(logspec) ! *sigma*sigma/fsample -RK end do @@ -615,7 +655,7 @@ SUBROUTINE get_spectrum_interp(idet, ipsd, f, spectrum) ! the sigmas have already been updated from the PSDs - !plateau = detectors(idet)%sigmas(ipsd)**2 / fsample + !plateau = detectors(idet)%sigmas(ipsd) ** 2 / fsample plateau = detectors(idet)%plateaus(ipsd) else @@ -673,7 +713,7 @@ SUBROUTINE get_spectrum_interp(idet, ipsd, f, spectrum) loop_bins : do i = 1, nof if (spectrum(i) <= 0) then ! First look for valid value in higher frequency - loop_next_bins: do j = i+1, nof + loop_next_bins: do j = i + 1, nof if (spectrum(j) > 0) then spectrum(i) = spectrum(j) cycle loop_bins @@ -733,7 +773,7 @@ subroutine trim_interval(kstart, kstop, noba, idet, nna) nna(0:basis_order, 0:basis_order, noba_short, nodetectors) integer :: gapmin, gaplen, i - if (all(nna(:, :, kstart+1:kstop, idet) == 0)) then + if (all(nna(:, :, kstart + 1:kstop, idet) == 0)) then noba = 0 return end if @@ -741,7 +781,7 @@ subroutine trim_interval(kstart, kstop, noba, idet, nna) ! trim the beginning do while (kstart < kstop) - if (any(nna(:, :, kstart+1, idet) /= 0)) exit + if (any(nna(:, :, kstart + 1, idet) /= 0)) exit kstart = kstart + 1 end do @@ -777,19 +817,19 @@ subroutine convolve_interval(ichunk, idet, x, y, xx, fx, fc, nna) integer, intent(in) :: ichunk, idet real(dp), intent(out) :: y(0:basis_order, noba_short, nodetectors) real(dp), intent(in) :: x(0:basis_order, noba_short, nodetectors) - complex(dp), intent(in) :: fc(nof/2 + 1, npsdtot) + complex(dp), intent(in) :: fc(nof / 2 + 1, npsdtot) real(dp), intent(in) :: & nna(0:basis_order, 0:basis_order, noba_short, nodetectors) real(C_DOUBLE) :: xx(nof) - complex(C_DOUBLE_COMPLEX) :: fx(nof/2 + 1) + complex(C_DOUBLE_COMPLEX) :: fx(nof / 2 + 1) integer :: noba, kstart, kstop, ipsd kstart = sum(noba_short_pp(1:ichunk-1)) kstop = kstart + noba_short_pp(ichunk) - ipsd = psd_index(idet, baselines_short_time(kstart+1)) + ipsd = psd_index(idet, baselines_short_time(kstart + 1)) - y(0, kstart+1:kstop, idet) = x(0, kstart+1:kstop, idet) + y(0, kstart + 1:kstop, idet) = x(0, kstart + 1:kstop, idet) if (ipsd == -1) then ! no PSD @@ -802,14 +842,14 @@ subroutine convolve_interval(ichunk, idet, x, y, xx, fx, fc, nna) call trim_interval(kstart, kstop, noba, idet, nna) if (noba == 0) exit - xx(1:noba) = x(0, kstart+1:kstart+noba, idet) + xx(1:noba) = x(0, kstart + 1:kstart+noba, idet) xx(1:noba) = xx(1:noba) - sum(xx(1:noba)) / noba - xx(noba+1:) = 0 + xx(noba + 1:) = 0 call dfft(fx, xx) fx = fx * fc(:, ipsd) call dfftinv(xx, fx) xx(1:noba) = xx(1:noba) - sum(xx(1:noba)) / noba - y(0, kstart+1:kstart+noba, idet) = xx(1:noba) + y(0, kstart + 1:kstart+noba, idet) = xx(1:noba) kstart = kstart + noba end do @@ -825,7 +865,7 @@ SUBROUTINE convolve_pp(y, x, fc, nna) real(dp), intent(out) :: y(noba_short, nodetectors) real(dp), intent(in) :: x(noba_short, nodetectors) - complex(dp), intent(in) :: fc(nof/2+1, npsdtot) + complex(dp), intent(in) :: fc(nof / 2 + 1, npsdtot) real(dp), intent(in) :: & nna(0:basis_order, 0:basis_order, noba_short, nodetectors) @@ -833,31 +873,29 @@ SUBROUTINE convolve_pp(y, x, fc, nna) real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() - type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) call reset_time(14) y = x - ! Allocate FFTW work space outside the threaded region to avoid - ! a GCC compiler segfault when freeing the workspace - do id_thread = 0, nthreads - 1 - pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) - pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) - end do - !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(nof, nodetectors, ninterval, noba_short_pp, & - !$OMP baselines_short_time, fc, x, y, nthreads, nna, pxx, pfx) & + !$OMP baselines_short_time, fc, x, y, nthreads, nna, pxx, pfx, & + !$OMP unaligned_fft, xxthreads, fxthreads) & !$OMP PRIVATE(idet, ichunk, m, xx, fx, ierr, id_thread, itask, & !$OMP num_threads) id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - call c_f_pointer(pxx(id_thread), xx, [nof]) - call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1]) + if (unaligned_fft) then + xx => xxthreads(:, id_thread) + fx => fxthreads(:, id_thread) + else + call c_f_pointer(pxx(id_thread), xx, [nof]) + call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1]) + end if itask = -1 do idet = 1, nodetectors @@ -870,11 +908,6 @@ SUBROUTINE convolve_pp(y, x, fc, nna) !$OMP END PARALLEL - do id_thread = 0, nthreads - 1 - call fftw_free(pxx(id_thread)) - call fftw_free(pfx(id_thread)) - end do - cputime_filter = cputime_filter + get_time(14) END SUBROUTINE convolve_pp @@ -897,7 +930,6 @@ SUBROUTINE construct_preconditioner(nna) real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() - type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) if (precond_width_max < 1) return @@ -921,15 +953,15 @@ SUBROUTINE construct_preconditioner(nna) do ichunk = 1, ninterval kstart = sum(noba_short_pp(1:ichunk-1)) noba = noba_short_pp(ichunk) - ipsddet = psd_index_det(idet, baselines_short_time(kstart+1)) + ipsddet = psd_index_det(idet, baselines_short_time(kstart + 1)) if (ipsddet < 0) then ! No PSD available - do k = kstart+1, kstart+noba + do k = kstart + 1, kstart+noba prec_diag(k, idet) = 1. end do cycle end if - do k = kstart+1, kstart+noba + do k = kstart + 1, kstart+noba if (nna(0, 0, k, idet) == 0) then prec_diag(k, idet) = 1. / detectors(idet)%weights(ipsddet) else @@ -943,10 +975,10 @@ SUBROUTINE construct_preconditioner(nna) ! Build fprecond to act as a fall back option if (allocated(fprecond)) deallocate(fprecond) - allocate(fprecond(nof/2+1, npsdtot), stat=ierr) + allocate(fprecond(nof / 2 + 1, npsdtot), stat=ierr) if (ierr /= 0) call abort_mpi('No room for fprecond') fprecond = 0 - memory_precond = memory_precond + (nof/2+1)*npsdtot*16. + memory_precond = memory_precond + (nof / 2 + 1)*npsdtot*16. ipsd = 0 do idet = 1, nodetectors do ipsddet = 1, detectors(idet)%npsd @@ -969,23 +1001,22 @@ SUBROUTINE construct_preconditioner(nna) call allocate_bandprec - ! Allocate FFTW work space outside the threaded region to avoid - ! a GCC compiler segfault when freeing the workspace - do id_thread = 0, nthreads - 1 - pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) - pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) - end do - !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & - !$OMP SHARED(nof, npsdtot, fcov, invcov, pxx, pfx) & + !$OMP SHARED(nof, npsdtot, fcov, invcov, pxx, pfx, & + !$OMP unaligned_fft, xxthreads, fxthreads) & !$OMP PRIVATE(id_thread, num_threads, ipsd, xx, fx) id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - call c_f_pointer(pxx(id_thread), xx, [nof]) - call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1]) + if (unaligned_fft) then + xx => xxthreads(:, id_thread) + fx => fxthreads(:, id_thread) + else + call c_f_pointer(pxx(id_thread), xx, [nof]) + call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1]) + end if do ipsd = 1, npsdtot if (modulo(ipsd-1, num_threads) /= id_thread) cycle @@ -996,11 +1027,6 @@ SUBROUTINE construct_preconditioner(nna) !$OMP END PARALLEL - do id_thread = 0, nthreads - 1 - call fftw_free(pxx(id_thread)) - call fftw_free(pfx(id_thread)) - end do - if (.not. use_cgprecond) then nempty = 0 nfail = 0 @@ -1034,10 +1060,10 @@ SUBROUTINE construct_preconditioner(nna) loop_subchunk : do call trim_interval(kstart, kstop, noba, idet, nna) if (noba == 0) exit - ipsd = psd_index(idet, baselines_short_time(kstart+1)) + ipsd = psd_index(idet, baselines_short_time(kstart + 1)) if (ipsd == -1 .or. noba < 1 .or. & - all(nna(0, 0, kstart+1:kstart+noba, idet) == 0)) then + all(nna(0, 0, kstart + 1:kstart+noba, idet) == 0)) then nempty = nempty + 1 cycle end if @@ -1045,15 +1071,15 @@ SUBROUTINE construct_preconditioner(nna) ! Rough guess for the width of the preconditioner based on ! total power in the band - p_tot = sum(invcov(:nof/2, ipsd)**2) + p_tot = sum(invcov(:nof / 2, ipsd) ** 2) try = 1 ierr = -1 loop_try : do - ! Rows from kstart+1 to kstart+noba for one band-diagonal + ! Rows from kstart + 1 to kstart+noba for one band-diagonal ! submatrix. Do the computation in double precision nband = min(noba, try * precond_width_min) nband = min(nband, precond_width_max) - p = sum(invcov(:nband, ipsd)**2) + p = sum(invcov(:nband, ipsd) ** 2) if (p > plim * p_tot) then ierr = 0 exit @@ -1068,7 +1094,7 @@ SUBROUTINE construct_preconditioner(nna) do call get_cholesky_decomposition( & bandprec(isub, ichunk, idet)%data, nband, noba, & - invcov(1, ipsd), nna(0, 0, kstart+1, idet), ierr) + invcov(1, ipsd), nna(0, 0, kstart + 1, idet), ierr) if (ierr == 0) exit @@ -1088,7 +1114,7 @@ SUBROUTINE construct_preconditioner(nna) !write (*, '(1x,a,i0,a,es18.10,a,es18.10)') & ! 'Cholesky decomposition failed for ' // & ! trim(detectors(idet)%name) // ', noba = ', noba, ', t = ', & - ! baselines_short_time(kstart+1), ' - ', & + ! baselines_short_time(kstart + 1), ' - ', & ! baselines_short_time(kstart+noba) nfail = nfail + 1 else @@ -1110,7 +1136,7 @@ SUBROUTINE construct_preconditioner(nna) print *, 'Constructed ', sum(ntries), ' band preconditioners.' do try = 1, trymax if (ntries(try) > 0) print *, ntries(try), ' at width = ', & - min(try*precond_width_min, precond_width_max) + min(try * precond_width_min, precond_width_max) end do if (nfail > 0) print *, ' ', nfail, ' failed (using CG)' if (nempty > 0) print *, ' Skipped ', nempty, ' empty intervals.' @@ -1118,14 +1144,7 @@ SUBROUTINE construct_preconditioner(nna) end if - memsum = memory_precond / 2**20 - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,'(1x,a,t32,3(f9.1," MB"))') & - 'Allocated memory for precond:', memsum, mem_min, mem_max - end if + call write_memory("Precond memory", memory_precond) cputime_prec_construct = cputime_prec_construct + get_time(16) @@ -1137,7 +1156,7 @@ END SUBROUTINE construct_preconditioner !------------------------------------------------------------------------------ - SUBROUTINE preconditioning_band(z, r, nna) + SUBROUTINE preconditioning_band(z, r, nna, istep) !Apply the preconditioner ! z and r may be 3-dimensional arrays in the calling code but @@ -1147,6 +1166,7 @@ SUBROUTINE preconditioning_band(z, r, nna) real(dp), intent(in), target :: r(0:basis_order, noba_short, nodetectors) real(dp), intent(in) :: & nna(0:basis_order, 0:basis_order, noba_short, nodetectors) + integer, intent(in) :: istep integer :: j, k, idet, kstart, kstop, noba, ichunk, ierr, itask, & num_threads, m, nband, ipsd, isub @@ -1154,7 +1174,6 @@ SUBROUTINE preconditioning_band(z, r, nna) real(C_DOUBLE), pointer :: xx(:) => NULL() complex(C_DOUBLE_COMPLEX), pointer :: fx(:) => NULL() - type(C_PTR) :: pxx(0:nthreads-1), pfx(0:nthreads-1) z = r @@ -1170,18 +1189,12 @@ SUBROUTINE preconditioning_band(z, r, nna) ! Apply the filter-based preconditioner call convolve_pp(z, r, fprecond, nna) else - ! Allocate FFTW work space outside the threaded region to avoid - ! a GCC compiler segfault when freeing the workspace - do id_thread = 0, nthreads - 1 - pxx(id_thread) = fftw_alloc_real(int(nof, C_SIZE_T)) - pfx(id_thread) = fftw_alloc_complex(int(nof/2 + 1, C_SIZE_T)) - end do - !$OMP PARALLEL NUM_THREADS(nthreads) & !$OMP DEFAULT(NONE) & !$OMP SHARED(noba_short_pp, ninterval, bandprec, z, r, nodetectors, & !$OMP id, nof, nshort, detectors, nthreads, fprecond, nna, & - !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx) & + !$OMP baselines_short_time, invcov, fcov, checknan, pxx, pfx, & + !$OMP unaligned_fft, xxthreads, fxthreads) & !$OMP PRIVATE(idet, ichunk, kstart, kstop, noba, j, k, ierr, m, & !$OMP xx, fx, nband, id_thread, itask, num_threads, ipsd, isub, & !$OMP t1, t2, tf) @@ -1191,8 +1204,13 @@ SUBROUTINE preconditioning_band(z, r, nna) id_thread = omp_get_thread_num() num_threads = omp_get_num_threads() - call c_f_pointer(pxx(id_thread), xx, [nof]) - call c_f_pointer(pfx(id_thread), fx, [nof/2 + 1]) + if (unaligned_fft) then + xx => xxthreads(:, id_thread) + fx => fxthreads(:, id_thread) + else + call c_f_pointer(pxx(id_thread), xx, [nof]) + call c_f_pointer(pfx(id_thread), fx, [nof / 2 + 1]) + end if itask = -1 do idet = 1, nodetectors @@ -1205,21 +1223,21 @@ SUBROUTINE preconditioning_band(z, r, nna) if (noba == 0) exit loop_subchunk itask = itask + 1 if (modulo(itask, num_threads) == id_thread) then - ipsd = psd_index_det(idet, baselines_short_time(kstart+1)) + ipsd = psd_index_det(idet, baselines_short_time(kstart + 1)) if (ipsd < 0) cycle if (detectors(idet)%weights(ipsd) == 0) cycle - ipsd = psd_index(idet, baselines_short_time(kstart+1)) + ipsd = psd_index(idet, baselines_short_time(kstart + 1)) if (allocated(bandprec(isub, ichunk, idet)%data)) then ! Use the precomputed Cholesky decomposition nband = size(bandprec(isub, ichunk, idet)%data, 1) call apply_cholesky_decomposition( & - noba, z(0, kstart+1, idet), r(0, kstart+1, idet), & + noba, z(0, kstart + 1, idet), r(0, kstart + 1, idet), & nband, bandprec(isub, ichunk, idet)%data) else ! Use CG iteration to apply the preconditioner call apply_CG_preconditioner( & - noba, nof, fcov(1, ipsd), nna(0, 0, kstart+1, idet), & - z(0, kstart+1, idet), r(0, kstart+1, idet), & + noba, nof, fcov(1, ipsd), nna(0, 0, kstart + 1, idet), & + z(0, kstart + 1, idet), r(0, kstart + 1, idet), & invcov(1, ipsd), fx, xx, tf) end if end if @@ -1231,10 +1249,6 @@ SUBROUTINE preconditioning_band(z, r, nna) !$OMP END PARALLEL - do id_thread = 0, nthreads - 1 - call fftw_free(pxx(id_thread)) - call fftw_free(pfx(id_thread)) - end do end if cputime_precond = cputime_precond + get_time(16) @@ -1289,18 +1303,18 @@ end subroutine get_cholesky_decomposition subroutine apply_CG_preconditioner( & - noba, nof, fcov, nna, x, b, invcov, fx, xx, tf) + noba, nof, fcov, nna, x, b, invcov, fx, xx, tfilter) ! ! Apply the preconditioner using CG iteration ! integer, intent(in) :: noba, nof - complex(dp), intent(in) :: fcov(nof/2+1) + complex(dp), intent(in) :: fcov(nof / 2 + 1) real(dp), intent(in) :: nna(noba), invcov(noba) real(dp), intent(out) :: x(noba) real(dp), intent(in) :: b(noba) - real(dp), intent(inout) :: tf real(C_DOUBLE), intent(inout) :: xx(nof) - complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof/2 + 1) + complex(C_DOUBLE_COMPLEX), intent(inout) :: fx(nof / 2 + 1) + real(dp), intent(out) :: tfilter real(dp), allocatable :: resid(:), prop(:), Aprop(:), zresid(:), precond(:) real(dp) :: alpha, beta, rr, rr0, rr_old, rz, rz_old, Anorm @@ -1328,8 +1342,9 @@ subroutine apply_CG_preconditioner( & prop = zresid iter = 1 + tfilter = 0 do - call apply_A(prop, Aprop, tf) + call apply_A(prop, Aprop) Anorm = dot_product(prop, Aprop) alpha = rz / Anorm x = x + alpha * prop @@ -1360,34 +1375,33 @@ subroutine apply_precond(x, z) z = precond * x end subroutine apply_precond - subroutine apply_A(x, y, tf) + subroutine apply_A(x, y) ! ! Apply the square matrix `A` to `x` and place the result in `y` ! real(dp), intent(in) :: x(noba) real(dp), intent(out) :: y(noba) - real(dp), intent(inout) :: tf - call convolve(x, fcov, y, tf) - y = y + nna*x + call convolve(x, fcov, y) + y = y + nna * x end subroutine apply_A - subroutine convolve(x, fc, y, tf) + subroutine convolve(x, fc, y) ! ! Convolve `x` with filter `fc` and place result in `y`. ! real(dp), intent(in) :: x(noba) - complex(dp) :: fc(nof/2 + 1) + complex(dp) :: fc(nof / 2 + 1) real(dp), intent(out) :: y(noba) - real(dp), intent(inout) :: tf - real(dp) :: t1, t2 - xx(:noba) = x - sum(x)/noba - xx(noba+1:) = 0 - !t1 = get_time(56) + !real(dp) :: t + xx(:noba) = x - sum(x) / noba + xx(noba + 1:) = 0 + !t = MPI_Wtime() call dfft(fx, xx) - fx = fx*fc + !tfilter = tfilter + (MPI_Wtime() - t) + fx = fx * fc + !t = MPI_Wtime() call dfftinv(xx, fx) - !t2 = get_time(56) - !tf = tf + t2 - t1 + !tfilter = tfilter + (MPI_Wtime() - t) y = xx(:noba) y = y - sum(y) / noba end subroutine convolve diff --git a/src/parameter_control.f90 b/src/parameter_control.f90 index c1170e6..ffda8e7 100644 --- a/src/parameter_control.f90 +++ b/src/parameter_control.f90 @@ -8,6 +8,7 @@ MODULE parameter_control use pointing, only : subchunk use maps_and_baselines, only : memory_baselines, memory_maps, & memory_basis_functions + use memory_and_time, only : write_memory implicit none private @@ -589,10 +590,13 @@ SUBROUTINE write_parameters() write (*,fi) 'nside_cross', nside_cross, 'Healpix resolution (destriping)' if (info > 1) & write (*,fi) 'nside_submap', nside_submap, 'Submap resolution' - write (*,fk) 'concatenate_messages', concatenate_messages, & - 'use mpi_alltoallv to communicate' - write (*,fk) 'allreduce', allreduce, & - 'use allreduce to communicate' + if (allreduce) then + write (*,fk) 'allreduce', allreduce, & + 'use allreduce to communicate' + else + write (*,fk) 'concatenate_messages', concatenate_messages, & + 'use mpi_alltoallv to communicate' + end if write (*,fk) 'reassign_submaps', reassign_submaps, & 'minimize communication by reassigning submaps' @@ -620,12 +624,13 @@ SUBROUTINE write_parameters() if (kfirst) then write (*,*) - write (*,fk) 'kfirst',kfirst, 'First destriping ON' - write (*,ff) 'dnshort',dnshort,'Baseline length (samples)' + write (*,fk) 'kfirst', kfirst, 'First destriping ON' + write (*,ff) 'dnshort', dnshort, 'Baseline length (samples)' write (*,ff) ' ',dnshort/fsample,'seconds' if (kfilter) then - write (*,fk) 'kfilter',kfilter,'Noise filter ON' + write (*,fk) 'kfilter', kfilter, 'Noise filter ON' + write (*,fk) 'unaligned_fft', unaligned_fft else write (*,fk) 'kfilter', kfilter, 'Noise filter OFF' write (*,fe) 'diagfilter', diagfilter, 'diagonal baseline filter' @@ -932,16 +937,7 @@ SUBROUTINE start_timeloop() end do end if ! if (kfirst) - memsum = memory_basis_functions / 2**20 - mem_min = memsum - mem_max = memsum - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memsum) - if (ID == 0 .and. info > 0) then - write(*,mstr3) 'Allocated memory for basis_functions:', & - memsum, mem_min, mem_max - end if + call write_memory("Basis function memory", memory_basis_functions) !end if diff --git a/src/pointing.f90 b/src/pointing.f90 index acd2d37..4334c01 100644 --- a/src/pointing.f90 +++ b/src/pointing.f90 @@ -5,7 +5,7 @@ MODULE pointing use commonparam use mpi_wrappers - use memory_and_time, only : check_stat + use memory_and_time, only : check_stat, write_memory implicit none private @@ -59,15 +59,7 @@ SUBROUTINE init_pointing dummy_pixel = 12 * nside_max ** 2 - memory = memory_pointing / 2d0 ** 20 - - mem_min = memory; mem_max = memory - call min_mpi(mem_min) - call max_mpi(mem_max) - call sum_mpi(memory) - - if (id == 0 .and. info > 0) write(*,'(a,t32,3(f12.1," MB"))') & - ' Allocated memory for pointing:', memory, mem_min, mem_max + call write_memory("Pointing memory", memory_pointing) END SUBROUTINE init_pointing diff --git a/src/smadam.F90 b/src/smadam.F90 index 2a04be1..38af411 100644 --- a/src/smadam.F90 +++ b/src/smadam.F90 @@ -104,6 +104,8 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & integer(i2b) :: subchunkcounter integer(i8b) :: pixmin, pixmax + call set_parameter_defaults() + ! set up MPI call init_mpi(comm, ntasks, id) @@ -254,6 +256,10 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & cputime_init = cputime_init + get_time(1) + call tic + call init_filter + if (id == 0) call toc('init_filter') + call tic call build_filter if (id == 0) call toc('build_filter') @@ -453,6 +459,7 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & if (id == 0) write(*,*) if (id == 0) write(*,*) 'MEMORY (MB):' + memory_total = 0 call write_memory('Detector pointing', memory_pointing) call write_memory('TOD buffer', memory_tod) call write_memory('Maps', memory_maps) @@ -495,6 +502,9 @@ subroutine destripe(comm, parstring, ndet, detstring, detweights, & call write_time('Binning TOD', cputime_bin_maps) call write_time('Sending binned TOD', cputime_send_maps) call write_time('Counting hits', cputime_count_hits) + call write_time('Filter - allocate', cputime_filter_allocate) + call write_time('Filter - initialize', cputime_filter_init) + call write_time('Filter - build', cputime_filter_build) call write_time('Building preconditioner', cputime_prec_construct) call write_time('Subtract/add baselines', & cputime_clean_tod+cputime_unclean_tod) @@ -817,17 +827,18 @@ subroutine destripe_with_cache(comm, ndet, nsamp, nnz, & if (id == 0) write(*,*) if (id == 0) write(*,*) 'MEMORY (MB):' - call write_memory('Detector pointing', memory_pointing) - call write_memory('TOD buffer', memory_tod) - call write_memory('Maps', memory_maps) - call write_memory('Baselines', memory_baselines) - call write_memory('Basis functions', memory_basis_functions) - call write_memory('Noise filter', memory_filter) - call write_memory('Preconditioner', memory_precond) - call write_memory('Submap table', memory_ksubmap) - call write_memory('Temporary maps', memory_locmap) - call write_memory('All2All buffers', memory_all2all) - call write_memory('CG work space', memory_cg) + memory_total = 0 + call write_memory('Detector pointing', memory_pointing) + call write_memory('TOD buffer', memory_tod) + call write_memory('Maps', memory_maps) + call write_memory('Baselines', memory_baselines) + call write_memory('Basis functions', memory_basis_functions) + call write_memory('Noise filter', memory_filter) + call write_memory('Preconditioner', memory_precond) + call write_memory('Submap table', memory_ksubmap) + call write_memory('Temporary maps', memory_locmap) + call write_memory('All2All buffers', memory_all2all) + call write_memory('CG work space', memory_cg) call write_memory('NCM', memory_ncm) call write_memory('Total') end if @@ -859,6 +870,9 @@ subroutine destripe_with_cache(comm, ndet, nsamp, nnz, & call write_time('Binning TOD', cputime_bin_maps) call write_time('Sending binned TOD', cputime_send_maps) call write_time('Counting hits', cputime_count_hits) + call write_time('Filter - allocate', cputime_filter_allocate) + call write_time('Filter - initialize', cputime_filter_init) + call write_time('Filter - build', cputime_filter_build) call write_time('Building preconditioner', cputime_prec_construct) call write_time('Subtract/add baselines', & cputime_clean_tod + cputime_unclean_tod) @@ -1350,6 +1364,9 @@ subroutine reset_timers() cputime_send_maps = 0 cputime_count_hits = 0 cputime_prec_construct = 0 + cputime_filter_allocate = 0 + cputime_filter_init = 0 + cputime_filter_build = 0 cputime_cga_init = 0 cputime_cga = 0 diff --git a/src/smadam_routines.f90 b/src/smadam_routines.f90 index 534ae62..37686a3 100644 --- a/src/smadam_routines.f90 +++ b/src/smadam_routines.f90 @@ -701,6 +701,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) real(dp), allocatable, target :: ap_all_threads(:, :, :, :) real(dp), pointer :: ap_thread(:, :, :) real(dp), allocatable :: resid(:) + real(dp) :: t0, t1, t2, t3 if (info > 4) write(*,idf) ID,'Begin iteration...' @@ -730,7 +731,7 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) r = yba - call apply_preconditioner(z, r) + call apply_preconditioner(z, r, -1) p = z rz = sum(r * z, mask=rmask) @@ -872,26 +873,29 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) ! 2) Evaluate p^T.A.p - pap = sum(p*ap, mask=rmask) + call wait_mpi ! DEBUB + pap = sum(p * ap, mask=rmask) call sum_mpi(pap) ! 3) alpha = r.z / (p^T.A.p) - alpha = rz/pap + alpha = rz / pap ro = r ! Keep a copy for Polak-Ribiere beta ! 4) update `aa` and `r` aa(:, 1:noba_short, 1:nodetectors) = & - aa(:, 1:noba_short, 1:nodetectors) + alpha*p - r = r - alpha*ap + aa(:, 1:noba_short, 1:nodetectors) + alpha * p + r = r - alpha * ap ! 5) Precondition - call apply_preconditioner(z, r) + call wait_mpi ! DEBUB + call apply_preconditioner(z, r, istep) ! 6) Check for convergence + call wait_mpi ! DEBUB rzo = rz rz = sum(r * z, mask=rmask) rz2 = sum(ro * z, mask=rmask) @@ -901,21 +905,21 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) call sum_mpi(rr) ! This is the Fletcher-Reeves formula that ! assumes stationary preconditioning - !beta = rz / rzo + ! beta = rz / rzo ! This is the Polak-Ribiere formula that ! allows for updates to the preconditioner beta = (rz - rz2) / rzo - if (ID==0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') & - istep, rr/rrinit, rz2/rz, alpha, beta, get_time_and_reset(99) + if (ID == 0 .and. info > 1) write(*,'(i4,4es16.6," (",f8.3,"s)")') & + istep, rr / rrinit, rz2 / rz, alpha, beta, get_time_and_reset(99) - if (rr/rrinit > 1e3) call abort_mpi('CG is diverging') - if (rr/rrinit < cglimit .and. istep > iter_min) exit + if (rr / rrinit > 1e3) call abort_mpi('CG is diverging') + if (rr / rrinit < cglimit .and. istep > iter_min) exit if (rz == 0) exit ! 7) Update search direction, `p` - p = z + beta*p + p = z + beta * p end do @@ -936,12 +940,13 @@ SUBROUTINE iterate_a(aa, yba, nna, wamap, cca) contains - subroutine apply_preconditioner(z, r) + subroutine apply_preconditioner(z, r, istep) real(dp), intent(in) :: r(0:basis_order, noba_short, nodetectors) real(dp), intent(out) :: z(0:basis_order, noba_short, nodetectors) + integer, intent(in) :: istep integer :: i, idet if (basis_order == 0) then - call preconditioning_band(z, r, nna) + call preconditioning_band(z, r, nna, istep) else do idet = 1, nodetectors do i = 1, noba_short diff --git a/src/tod_storage.f90 b/src/tod_storage.f90 index eae6107..a2acf95 100644 --- a/src/tod_storage.f90 +++ b/src/tod_storage.f90 @@ -6,7 +6,7 @@ MODULE tod_storage use commonparam use mpi_wrappers - use memory_and_time, only : check_stat + use memory_and_time, only : check_stat, write_memory implicit none private @@ -43,14 +43,7 @@ SUBROUTINE allocate_tod memory_tod = memory_tod + nosamples_proc surveyflags = .true. - memsum = memory_tod/1024./1024. - - mem_min = memsum; mem_max = memsum - call min_mpi(mem_min); call max_mpi(mem_max) - call sum_mpi(memsum) - - if (id == 0 .and. info >= 1) & - write(*,mstr3) 'Allocated memory for TOD:', memsum, mem_min, mem_max + call write_memory("TOD memory", memory_tod) END SUBROUTINE allocate_tod