diff --git a/Nwpw/nwpwlib/D3dB/d3db.cpp b/Nwpw/nwpwlib/D3dB/d3db.cpp index 05b2cf1c..4c78ffc4 100644 --- a/Nwpw/nwpwlib/D3dB/d3db.cpp +++ b/Nwpw/nwpwlib/D3dB/d3db.cpp @@ -3182,7 +3182,7 @@ void d3db::cr_fft3d(double *a) indx0 += nxhy2; } - mygdevice.batch_cfftx_tmpx(fft_tag,false, nx, ny * nq, n2ft3d, a, tmpx); + mygdevice.batch_rfftx_tmpx(fft_tag,false, nx, ny * nq, n2ft3d, a, tmpx); } /************************* @@ -3211,7 +3211,7 @@ void d3db::cr_fft3d(double *a) *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - mygdevice.batch_cfftx_tmpx(fft_tag,false, nx, nq1, n2ft3d, a, tmpx); + mygdevice.batch_rfftx_tmpx(fft_tag,false, nx, nq1, n2ft3d, a, tmpx); zeroend_fftb(nx, nq1, 1, 1, a); if (n2ft3d_map < n2ft3d) @@ -3252,7 +3252,7 @@ void d3db::rc_fft3d(double *a) *** do fft along nx dimension *** *** A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] *** ********************************************/ - mygdevice.batch_cfftx_tmpx(fft_tag,true, nx, ny*nq, n2ft3d, a, tmpx); + mygdevice.batch_rfftx_tmpx(fft_tag,true, nx, ny*nq, n2ft3d, a, tmpx); /******************************************** *** do fft along ny dimension *** @@ -3368,7 +3368,7 @@ void d3db::rc_fft3d(double *a) *** do fft along nx dimension *** *** A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] *** ********************************************/ - mygdevice.batch_cfftx_tmpx(fft_tag,true, nx, nq1, n2ft3d, a, tmpx); + mygdevice.batch_rfftx_tmpx(fft_tag,true, nx, nq1, n2ft3d, a, tmpx); c_transpose_ijk(0, a, tmp2, tmp3); diff --git a/Nwpw/nwpwlib/device/gdevice2.cpp b/Nwpw/nwpwlib/device/gdevice2.cpp index b225e16e..fa0ae6de 100644 --- a/Nwpw/nwpwlib/device/gdevice2.cpp +++ b/Nwpw/nwpwlib/device/gdevice2.cpp @@ -143,6 +143,27 @@ void gdevice2::batch_fft_pipeline_mem_init(const int nstages, const int n2ft3d) } + +void gdevice2::batch_rfftx_tmpx(const int tag,bool forward, int nx, int nq, int n2ft3d, + double *a, double *tmpx) { +#if defined(NWPW_CUDA) || defined(NWPW_HIP) + if (mygdevice2->hasgpu) + mygdevice2->batch_rfftx(tag,forward, nx, nq, n2ft3d, a); +#else + mygdevice2->batch_rfftx_tmpx(forward, nx, nq, n2ft3d, a, tmpx); +#endif +} + +void gdevice2::batch_rfftx_stages_tmpx(const int stage, const int tag,bool forward, int nx, int nq, int n2ft3d, + double *a, double *tmpx, int da) { +#if defined(NWPW_CUDA) || defined(NWPW_HIP) + if (mygdevice2->hasgpu) + mygdevice2->batch_rfftx_stages(stage,tag,forward, nx, nq, n2ft3d, a,da); +#endif +} + + + void gdevice2::batch_cfftx_tmpx(const int tag,bool forward, int nx, int nq, int n2ft3d, double *a, double *tmpx) { #if defined(NWPW_CUDA) || defined(NWPW_HIP) diff --git a/Nwpw/nwpwlib/device/gdevice2.hpp b/Nwpw/nwpwlib/device/gdevice2.hpp index 06464a91..2f30cdc8 100644 --- a/Nwpw/nwpwlib/device/gdevice2.hpp +++ b/Nwpw/nwpwlib/device/gdevice2.hpp @@ -47,10 +47,12 @@ Gdevices *mygdevice2; void batch_fft_pipeline_mem_init(const int,const int); + void batch_rfftx_tmpx(const int, bool, int, int, int, double *, double *); void batch_cfftx_tmpx(const int, bool, int, int, int, double *, double *); void batch_cffty_tmpy(const int, bool, int, int, int, double *, double *); void batch_cfftz_tmpz(const int, bool, int, int, int, double *, double *); + void batch_rfftx_stages_tmpx(const int,const int, bool, int, int, int, double *, double *,int); void batch_cfftx_stages_tmpx(const int,const int, bool, int, int, int, double *, double *,int); void batch_cffty_stages_tmpy(const int,const int, bool, int, int, int, double *, double *,int); void batch_cfftz_stages_tmpz(const int,const int, bool, int, int, int, double *, double *,int); diff --git a/Nwpw/nwpwlib/device/gdevices.hpp b/Nwpw/nwpwlib/device/gdevices.hpp index 296b9375..6fc98fea 100644 --- a/Nwpw/nwpwlib/device/gdevices.hpp +++ b/Nwpw/nwpwlib/device/gdevices.hpp @@ -217,7 +217,7 @@ class Gdevices { } } - void batch_cfftx_tmpx(bool forward, int nx, int nq, int n2ft3d, double *a, double *tmpx) + void batch_rfftx_tmpx(bool forward, int nx, int nq, int n2ft3d, double *a, double *tmpx) { int nxh2 = nx + 2; if (forward) @@ -258,6 +258,29 @@ class Gdevices { } } + + void batch_cfftx_tmpx(bool forward, int nx, int nq, int n2ft3d, double *a, double *tmpx) + { + if (forward) + { + int indx = 0; + for (auto q=0; q(dev_mem[ia_dev]), + reinterpret_cast(dev_mem[ia_dev]), + CUFFT_FORWARD)); + } else { + NWPW_CUFFT_ERROR(cufftExecZ2Z( + plan_x[fft_indx], reinterpret_cast(dev_mem[ia_dev]), + reinterpret_cast(dev_mem[ia_dev]), + CUFFT_INVERSE)); + } + + NWPW_CUDA_ERROR(cudaMemcpy(a, dev_mem[ia_dev], n2ft3d * sizeof(double), cudaMemcpyDeviceToHost)); + + inuse[ia_dev] = false; + } + + + + /************************************** + * * + * batch_cfftx_stages * + * * + **************************************/ + void batch_cfftx_stages(const int stage, const int fft_indx, bool forward, int nx, int nq, int n2ft3d, double *a, int da) + { + //int ia_dev = fetch_dev_mem_indx(((size_t)n2ft3d)); + int ia_dev = ifft_dev[da]; + if (stage==0) + { + inuse[ia_dev] = true; + NWPW_CUDA_ERROR(cudaMemcpyAsync(dev_mem[ia_dev],a,n2ft3d*sizeof(double),cudaMemcpyHostToDevice,stream[da])); + } + else if (stage==1) + { + //NWPW_CUDA_ERROR(cudaStreamSynchronize(stream[da])); + if (forward) { + NWPW_CUFFT_ERROR(cufftExecZ2Z(plan_x[fft_indx], + reinterpret_cast(dev_mem[ia_dev]), + reinterpret_cast(dev_mem[ia_dev]), + CUFFT_FORWARD)); + } else { + NWPW_CUFFT_ERROR(cufftExecZ2Z(plan_x[fft_indx], + reinterpret_cast(dev_mem[ia_dev]), + reinterpret_cast(dev_mem[ia_dev]), + CUFFT_INVERSE)); + } + NWPW_CUDA_ERROR(cudaMemcpyAsync(a,dev_mem[ia_dev],n2ft3d*sizeof(double),cudaMemcpyDeviceToHost,stream[da])); + } + else if (stage==2) + { + NWPW_CUDA_ERROR(cudaStreamSynchronize(stream[da])); + inuse[ia_dev] = false; + } + } + + + + + /************************************** diff --git a/Nwpw/nwpwlib/device/gdevices_hip.hpp b/Nwpw/nwpwlib/device/gdevices_hip.hpp index fa7778c2..6e74bcb4 100644 --- a/Nwpw/nwpwlib/device/gdevices_hip.hpp +++ b/Nwpw/nwpwlib/device/gdevices_hip.hpp @@ -152,8 +152,8 @@ class Gdevices { int fftcount = 0; int nxfft[2], nyfft[2], nzfft[2]; - rocfft_plan forward_plan_x[2], forward_plan_y[2], forward_plan_z[2]; - rocfft_plan backward_plan_x[2], backward_plan_y[2], backward_plan_z[2]; + rocfft_plan forward_plan_x[2], forward_plan_y[2], forward_plan_z[2], forward_plan_rx[2]; + rocfft_plan backward_plan_x[2], backward_plan_y[2], backward_plan_z[2], backward_plan_rx[2];; int ifft_dev[15]; int ifft_n; @@ -892,14 +892,23 @@ class Gdevices { size_t length_nz = (size_t)nz; NWPW_ROCFFT_ERROR(rocfft_plan_create( - &forward_plan_x[fftcount], rocfft_placement_inplace, + &forward_plan_rx[fftcount], rocfft_placement_inplace, rocfft_transform_type_real_forward, rocfft_precision_double, (size_t)1, &length_nx, (size_t)nq1, nullptr)); NWPW_ROCFFT_ERROR(rocfft_plan_create( - &backward_plan_x[fftcount], rocfft_placement_inplace, + &backward_plan_rx[fftcount], rocfft_placement_inplace, rocfft_transform_type_real_inverse, rocfft_precision_double, (size_t)1, &length_nx, (size_t)nq1, nullptr)); + NWPW_ROCFFT_ERROR(rocfft_plan_create( + &forward_plan_x[fftcount], rocfft_placement_inplace, + rocfft_transform_type_complex_forward, rocfft_precision_double, + (size_t)1, &length_nx, (size_t)nq1, nullptr)); + NWPW_ROCFFT_ERROR(rocfft_plan_create( + &backward_plan_x[fftcount], rocfft_placement_inplace, + rocfft_transform_type_complex_inverse, rocfft_precision_double, + (size_t)1, &length_nx, (size_t)nq1, nullptr)); + NWPW_ROCFFT_ERROR(rocfft_plan_create( &forward_plan_y[fftcount], rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, @@ -955,55 +964,106 @@ class Gdevices { ndev_mem = 0; } - void batch_cfftx(const int fft_indx, bool forward, int nx, int nq, int n2ft3d, - double *a) { - int ia_dev = fetch_dev_mem_indx(((size_t)n2ft3d)); - NWPW_HIP_ERROR(hipMemcpy(dev_mem[ia_dev], a, n2ft3d * sizeof(double), - hipMemcpyHostToDevice)); + void batch_rfftx(const int fft_indx, bool forward, int nx, int nq, int n2ft3d, double *a) + { + int ia_dev = fetch_dev_mem_indx(((size_t)n2ft3d)); + NWPW_HIP_ERROR(hipMemcpy(dev_mem[ia_dev], a, n2ft3d * sizeof(double), hipMemcpyHostToDevice)); + + if (forward) { + NWPW_ROCFFT_ERROR(rocfft_execute( + forward_plan_rx[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } else { + NWPW_ROCFFT_ERROR(rocfft_execute( + backward_plan_rx[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } + + NWPW_HIP_ERROR(hipMemcpy(a, dev_mem[ia_dev], n2ft3d * sizeof(double), hipMemcpyDeviceToHost)); + + inuse[ia_dev] = false; + } - if (forward) { - NWPW_ROCFFT_ERROR(rocfft_execute( - forward_plan_x[fft_indx], - reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); - } else { - NWPW_ROCFFT_ERROR(rocfft_execute( - backward_plan_x[fft_indx], - reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); - } + void batch_rfftx_stages(const int stage, const int fft_indx, bool forward, + int nx, int nq, int n2ft3d, double *a, int da) + { + // int ia_dev = fetch_dev_mem_indx(((size_t) n2ft3d)); + int ia_dev = ifft_dev[da]; + + if (stage == 0) { + inuse[ia_dev] = true; + NWPW_HIP_ERROR(hipMemcpyAsync(dev_mem[ia_dev], a, n2ft3d * sizeof(double), + hipMemcpyHostToDevice, stream[da])); + } else if (stage == 1) { + // NWPW_HIP_ERROR(hipStreamSynchronize(stream[da])); + if (forward) { + NWPW_ROCFFT_ERROR(rocfft_execute( + forward_plan_rx[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } else { + NWPW_ROCFFT_ERROR(rocfft_execute( + backward_plan_rx[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } + NWPW_HIP_ERROR(hipMemcpyAsync(a, dev_mem[ia_dev], n2ft3d * sizeof(double), + hipMemcpyDeviceToHost, stream[da])); + } else if (stage == 2) { + NWPW_HIP_ERROR(hipStreamSynchronize(stream[da])); + inuse[ia_dev] = false; + } + } - NWPW_HIP_ERROR(hipMemcpy(a, dev_mem[ia_dev], n2ft3d * sizeof(double), - hipMemcpyDeviceToHost)); - inuse[ia_dev] = false; + void batch_cfftx(const int fft_indx, bool forward, int nx, int nq, int n2ft3d, double *a) + { + int ia_dev = fetch_dev_mem_indx(((size_t)n2ft3d)); + NWPW_HIP_ERROR(hipMemcpy(dev_mem[ia_dev], a, n2ft3d * sizeof(double), hipMemcpyHostToDevice)); + + if (forward) { + NWPW_ROCFFT_ERROR(rocfft_execute( + forward_plan_x[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } else { + NWPW_ROCFFT_ERROR(rocfft_execute( + backward_plan_x[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } + + NWPW_HIP_ERROR(hipMemcpy(a, dev_mem[ia_dev], n2ft3d * sizeof(double), hipMemcpyDeviceToHost)); + + inuse[ia_dev] = false; + } + + void batch_cfftx_stages(const int stage, const int fft_indx, bool forward, + int nx, int nq, int n2ft3d, double *a, int da) + { + // int ia_dev = fetch_dev_mem_indx(((size_t) n2ft3d)); + int ia_dev = ifft_dev[da]; + + if (stage == 0) { + inuse[ia_dev] = true; + NWPW_HIP_ERROR(hipMemcpyAsync(dev_mem[ia_dev], a, n2ft3d * sizeof(double), hipMemcpyHostToDevice, stream[da])); + } else if (stage == 1) { + // NWPW_HIP_ERROR(hipStreamSynchronize(stream[da])); + if (forward) { + NWPW_ROCFFT_ERROR(rocfft_execute( + forward_plan_x[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } else { + NWPW_ROCFFT_ERROR(rocfft_execute( + backward_plan_x[fft_indx], + reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); + } + NWPW_HIP_ERROR(hipMemcpyAsync(a, dev_mem[ia_dev], n2ft3d * sizeof(double), + hipMemcpyDeviceToHost, stream[da])); + } else if (stage == 2) { + NWPW_HIP_ERROR(hipStreamSynchronize(stream[da])); + inuse[ia_dev] = false; + } } - void batch_cfftx_stages(const int stage, const int fft_indx, bool forward, - int nx, int nq, int n2ft3d, double *a, int da) { - // int ia_dev = fetch_dev_mem_indx(((size_t) n2ft3d)); - int ia_dev = ifft_dev[da]; - if (stage == 0) { - inuse[ia_dev] = true; - NWPW_HIP_ERROR(hipMemcpyAsync(dev_mem[ia_dev], a, n2ft3d * sizeof(double), - hipMemcpyHostToDevice, stream[da])); - } else if (stage == 1) { - // NWPW_HIP_ERROR(hipStreamSynchronize(stream[da])); - if (forward) { - NWPW_ROCFFT_ERROR(rocfft_execute( - forward_plan_x[fft_indx], - reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); - } else { - NWPW_ROCFFT_ERROR(rocfft_execute( - backward_plan_x[fft_indx], - reinterpret_cast(&(dev_mem[ia_dev])), nullptr, nullptr)); - } - NWPW_HIP_ERROR(hipMemcpyAsync(a, dev_mem[ia_dev], n2ft3d * sizeof(double), - hipMemcpyDeviceToHost, stream[da])); - } else if (stage == 2) { - NWPW_HIP_ERROR(hipStreamSynchronize(stream[da])); - inuse[ia_dev] = false; - } - } + void batch_cffty(const int fft_indx, bool forward, int ny, int nq, int n2ft3d, double *a) { diff --git a/Nwpw/nwpwlib/device/gdevices_sycl.hpp b/Nwpw/nwpwlib/device/gdevices_sycl.hpp index 2e278b24..53b68756 100644 --- a/Nwpw/nwpwlib/device/gdevices_sycl.hpp +++ b/Nwpw/nwpwlib/device/gdevices_sycl.hpp @@ -1084,35 +1084,36 @@ class Gdevices { //ndev_mem = 0; } - - - void batch_cfftx_tmpx(bool forward, int nx, int nq, int n2ft3d, double *a, double *tmpx) - { + + + + void batch_rfftx_tmpx(bool forward, int nx, int nq, int n2ft3d, double *a, double *tmpx) + { int nxh2 = nx + 2; if (forward) { int indx = 0; - for (auto q = 0; q < nq; ++q) - { - drfftf_(&nx, a + indx, tmpx); + for (auto q = 0; q < nq; ++q) + { + drfftf_(&nx, a + indx, tmpx); indx += nxh2; } indx = 1; for (auto j = 0; j < (nq); ++j) { - for (auto i = nx; i >= 2; --i) + for (auto i = nx; i >= 2; --i) { - a[indx + i - 1] = a[indx + i - 2]; + a[indx + i - 1] = a[indx + i - 2]; } a[indx + 1 - 1] = 0.0; a[indx + nx + 1 - 1] = 0.0; indx += nxh2; - } - } + } + } else { int indx = 1; - for (auto j = 0; j < nq; ++j) + for (auto j = 0; j < nq; ++j) { for (auto i = 2; i <= nx; ++i) a[indx + i - 2] = a[indx + i - 1]; @@ -1126,10 +1127,29 @@ class Gdevices { } } } - - - - + + + void batch_cfftx_tmpx(bool forward, int nx, int nq, int n2ft3d, double *a, double *tmpx) + { + if (forward) + { + int indx = 0; + for (auto q = 0; q < nq; ++q) + { + dcfftf_(&nx, a + indx, tmpx); + indx += (2*nx); + } + } + else + { + indx = 0; + for (auto q = 0; q < nq; ++q) + { + dcfftb_(&nx, a + indx, tmpx); + indx += (2*nx); + } + } + } void batch_cffty_tmpy(bool forward, int ny, int nq, int n2ft3d, double *a, double *tmpy) { diff --git a/Nwpw/nwpwlib/lattice/PGrid.cpp b/Nwpw/nwpwlib/lattice/PGrid.cpp index 5accc379..8d6698e8 100644 --- a/Nwpw/nwpwlib/lattice/PGrid.cpp +++ b/Nwpw/nwpwlib/lattice/PGrid.cpp @@ -1031,7 +1031,7 @@ void PGrid::cr_pfft3b(const int nb, double *a) } d3db::zeroend_fftb(nx,ny,nq,1,a); */ - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,false, nx, ny * nq, n2ft3d, a, d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,false, nx, ny * nq, n2ft3d, a, d3db::tmpx); d3db::zeroend_fftb(nx, ny, nq, 1, a); } @@ -1061,7 +1061,7 @@ void PGrid::cr_pfft3b(const int nb, double *a) *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,false, nx, nq1, n2ft3d, a, d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,false, nx, nq1, n2ft3d, a, d3db::tmpx); d3db::zeroend_fftb(nx, nq1, 1, 1, a); if (n2ft3d_map < n2ft3d) @@ -1118,7 +1118,7 @@ void PGrid::rc_pfft3f(const int nb, double *a) d3db::cshift_fftf(nx,ny,nq,1,a); */ - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,true,nx,ny*nq,n2ft3d,a,d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,true,nx,ny*nq,n2ft3d,a,d3db::tmpx); /******************************************** *** do fft along ny dimension *** @@ -1255,7 +1255,7 @@ void PGrid::rc_pfft3f(const int nb, double *a) *** do fft along nx dimension *** *** A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] *** ********************************************/ - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,true, nx, nq1, n2ft3d, a, d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,true, nx, nq1, n2ft3d, a, d3db::tmpx); d3db::c_ptranspose_ijk(nb, 0, a, tmp2, tmp3); @@ -1527,7 +1527,7 @@ void PGrid::pfftbx(const int nb, double *tmp1, double *tmp2, int request_indx) *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx); d3db::zeroend_fftb(nx, ny, nq, 1, tmp2); std::memcpy(tmp1, tmp2, n2ft3d * sizeof(double)); } @@ -1542,7 +1542,7 @@ void PGrid::pfftbx(const int nb, double *tmp1, double *tmp2, int request_indx) *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx); d3db::zeroend_fftb(nx, nq1, 1, 1, tmp1); if (n2ft3d_map < n2ft3d) std::memset(tmp1 + n2ft3d_map, 0, (n2ft3d - n2ft3d_map) * sizeof(double)); @@ -2008,7 +2008,7 @@ void PGrid::pfftbx_start(const int nb, double *tmp1, double *tmp2, int request_i *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_stages_tmpx(0,d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(0,d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx,da_indx); } /************************* **** hilbert mapping **** @@ -2021,7 +2021,7 @@ void PGrid::pfftbx_start(const int nb, double *tmp1, double *tmp2, int request_i *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_stages_tmpx(0,d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(0,d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx,da_indx); } } @@ -2042,7 +2042,7 @@ void PGrid::pfftbx_compute(const int nb, double *tmp1, double *tmp2, int request *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_stages_tmpx(1,d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(1,d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx,da_indx); } /************************* @@ -2054,7 +2054,7 @@ void PGrid::pfftbx_compute(const int nb, double *tmp1, double *tmp2, int request *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_stages_tmpx(1,d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(1,d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx,da_indx); } } @@ -2076,7 +2076,7 @@ void PGrid::pfftbx_end(const int nb, double *tmp1, double *tmp2, int request_ind *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_stages_tmpx(2,d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(2,d3db::fft_tag,false, nx, ny * nq, n2ft3d, tmp2, d3db::tmpx,da_indx); d3db::zeroend_fftb(nx, ny, nq, 1, tmp2); std::memcpy(tmp1, tmp2, n2ft3d * sizeof(double)); } @@ -2089,7 +2089,7 @@ void PGrid::pfftbx_end(const int nb, double *tmp1, double *tmp2, int request_ind *** do fft along kx dimension *** *** A(nx,ny,nz) <- fft1d^(-1)[A(kx,ny,nz)] *** ************************************************/ - d3db::mygdevice.batch_cfftx_stages_tmpx(2,d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(2,d3db::fft_tag,false, nx, nq1, n2ft3d, tmp1, d3db::tmpx,da_indx); d3db::zeroend_fftb(nx, nq1, 1, 1, tmp1); if (n2ft3d_map < n2ft3d) std::memset(tmp1 + n2ft3d_map, 0, (n2ft3d - n2ft3d_map) * sizeof(double)); @@ -2238,7 +2238,7 @@ void PGrid::pfftfx(const int nb, double *a, double *tmp1, double *tmp2, int requ if (maptype == 1) { // do fft along nx dimension - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,true, nx, ny*nq, n2ft3d, a, d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,true, nx, ny*nq, n2ft3d, a, d3db::tmpx); std::memcpy(tmp1, a, n2ft3d * sizeof(double)); } /**** hilbert mapping ****/ @@ -2246,7 +2246,7 @@ void PGrid::pfftfx(const int nb, double *a, double *tmp1, double *tmp2, int requ { // do fft along nx dimension // A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] - d3db::mygdevice.batch_cfftx_tmpx(d3db::fft_tag,true, nx, nq1, n2ft3d, a, d3db::tmpx); + d3db::mygdevice.batch_rfftx_tmpx(d3db::fft_tag,true, nx, nq1, n2ft3d, a, d3db::tmpx); d3db::c_ptranspose_ijk_start(nb, 0, a, tmp1, tmp2, request_indx, 40); } } @@ -2460,7 +2460,7 @@ void PGrid::pfftfx_start(const int nb, double *a, double *tmp1, double *tmp2, in { // do fft along nx dimension std::memcpy(tmp2, a, n2ft3d * sizeof(double)); - d3db::mygdevice.batch_cfftx_stages_tmpx(0,d3db::fft_tag,true, nx, ny*nq, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(0,d3db::fft_tag,true, nx, ny*nq, n2ft3d, tmp2, d3db::tmpx,da_indx); } /**** hilbert mapping ****/ else @@ -2468,7 +2468,7 @@ void PGrid::pfftfx_start(const int nb, double *a, double *tmp1, double *tmp2, in // do fft along nx dimension // A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] std::memcpy(tmp2, a, n2ft3d * sizeof(double)); - d3db::mygdevice.batch_cfftx_stages_tmpx(0,d3db::fft_tag,true, nx, nq1, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(0,d3db::fft_tag,true, nx, nq1, n2ft3d, tmp2, d3db::tmpx,da_indx); } } @@ -2483,14 +2483,14 @@ void PGrid::pfftfx_compute(const int nb, double *a, double *tmp1, double *tmp2, if (maptype == 1) { // do fft along nx dimension - d3db::mygdevice.batch_cfftx_stages_tmpx(1,d3db::fft_tag,true, nx, ny*nq, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(1,d3db::fft_tag,true, nx, ny*nq, n2ft3d, tmp2, d3db::tmpx,da_indx); } /**** hilbert mapping ****/ else { // do fft along nx dimension // A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] - d3db::mygdevice.batch_cfftx_stages_tmpx(1,d3db::fft_tag,true, nx, nq1, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(1,d3db::fft_tag,true, nx, nq1, n2ft3d, tmp2, d3db::tmpx,da_indx); } } @@ -2506,7 +2506,7 @@ void PGrid::pfftfx_end(const int nb, double *a, double *tmp1, double *tmp2, int if (maptype == 1) { // do fft along nx dimension - d3db::mygdevice.batch_cfftx_stages_tmpx(2,d3db::fft_tag,true, nx, ny*nq, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(2,d3db::fft_tag,true, nx, ny*nq, n2ft3d, tmp2, d3db::tmpx,da_indx); std::memcpy(tmp1, tmp2, n2ft3d * sizeof(double)); } /**** hilbert mapping ****/ @@ -2514,7 +2514,7 @@ void PGrid::pfftfx_end(const int nb, double *a, double *tmp1, double *tmp2, int { // do fft along nx dimension // A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] - d3db::mygdevice.batch_cfftx_stages_tmpx(2,d3db::fft_tag,true, nx, nq1, n2ft3d, tmp2, d3db::tmpx,da_indx); + d3db::mygdevice.batch_rfftx_stages_tmpx(2,d3db::fft_tag,true, nx, nq1, n2ft3d, tmp2, d3db::tmpx,da_indx); d3db::c_ptranspose_ijk_start(nb, 0, tmp2, tmp1, tmp2, request_indx, 40);