Why is my GPU version slower than the CPU version? #433
Replies: 6 comments
-
Can you give us a reproducible example (or all your problem parameters, N,
M, eps, etc) and your setup (type of GPU, type of CPU, etc)? It could be
you are running too small a problem.
-
Here is my setup:
CPU: Intel(R) Xeon(R) Gold 6230R CPU @ 2.10GHz with 52 cores
GPU: GeForce RTX 3090 24GB
All runs on a CentOS 7 system.
Dimension of data:
[3458 36 24] <-> [256 256 24]
Time elapsed:
finufft cpu: 0.119 s
cufinufft: 0.387 s (including the data-transfer time: HostToDevice 0.005 s and DeviceToHost 0.002 s)
The code:
```
int64_t m_iSamples = 3458;
int64_t m_iSpokes = 36;
int64_t m_iROSize = 256;
int64_t m_iPESize = 256;
int64_t m_iSPESize = 24;
int64_t m_iCHSize = 1;
std::shared_ptr<CFloat[]> KspRank(new CFloat[m_iSamples * m_iSpokes * m_iSPESize * m_iCHSize]);
CFloat* pKspSrc = KspRank.get();
std::shared_ptr<CFloat[]> ImgRank(new CFloat[m_iROSize * m_iPESize * m_iSPESize * m_iCHSize]);
CFloat* pImgDes = ImgRank.get();
std::shared_ptr<float[]> kTraj(new float[m_iSamples * m_iSpokes * 2]);
float* m_iKtraj = kTraj.get();

// My pre-saved data
ReadBin(sFilePathSubspace + "ksp_3458x36x24.bin", pKspSrc, m_iSamples * m_iSpokes * m_iSPESize);
ReadBin(sFilePathSubspace + "traj_3458x36x2.bin", m_iKtraj, m_iSamples * m_iSpokes * 2); // [-pi, pi)
for (int64_t ii = 1; ii < m_iCHSize; ++ii)
{
    memcpy(pKspSrc + ii * m_iSamples * m_iSpokes * m_iSPESize,
           pKspSrc, sizeof(CFloat) * m_iSamples * m_iSpokes * m_iSPESize);
}
int iNumThread = std::min((int)m_iSPESize * m_iCHSize, omp_get_max_threads());

// CPU
boost::posix_time::ptime tC = boost::posix_time::microsec_clock::local_time();
finufft_opts m_optsB;
finufft_default_opts(&m_optsB);
m_optsB.upsampfac = 2.0;
m_optsB.nthreads = iNumThread;
finufftf2d1many(m_iSPESize * m_iCHSize, m_iSamples * m_iSpokes,
                m_iKtraj, m_iKtraj + m_iSamples * m_iSpokes, pKspSrc,
                1, 1e-6, m_iROSize, m_iPESize, pImgDes, &m_optsB);
boost::posix_time::time_duration durationC = boost::posix_time::microsec_clock::local_time() - tC;
std::cout << "finufftf cpu Elapsed time= " << durationC << std::endl;

// GPU
float* m_d_x, * m_d_y;
cuFloatComplex* m_d_c, * m_d_fk;
cufinufftf_plan m_dplan;
cudaMalloc(&m_d_x, m_iSamples * m_iSpokes * sizeof(float));
cudaMalloc(&m_d_y, m_iSamples * m_iSpokes * sizeof(float));
cudaMalloc(&m_d_c, m_iSamples * m_iSpokes * m_iSPESize * m_iCHSize * sizeof(cuFloatComplex));
cudaMalloc(&m_d_fk, m_iROSize * m_iPESize * m_iSPESize * m_iCHSize * sizeof(cuFloatComplex));
cufinufft_opts m_gpuopts;
cufinufft_default_opts(&m_gpuopts);
m_gpuopts.gpu_method = 1;
m_gpuopts.gpu_binsizex = 32;
m_gpuopts.gpu_binsizey = 32;
boost::posix_time::ptime t1 = boost::posix_time::microsec_clock::local_time();
cudaMemcpy(m_d_x, m_iKtraj, m_iSamples * m_iSpokes * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(m_d_y, m_iKtraj + m_iSamples * m_iSpokes, m_iSamples * m_iSpokes * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(m_d_c, pKspSrc, m_iSamples * m_iSpokes * m_iSPESize * m_iCHSize * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
int64_t nmodes[3];
nmodes[0] = m_iROSize;
nmodes[1] = m_iPESize;
nmodes[2] = 1;
int ier;
boost::posix_time::time_duration duration1 = boost::posix_time::microsec_clock::local_time() - t1;
std::cout << "cufinufftf HostToDevice Elapsed time= " << duration1 << std::endl;
boost::posix_time::ptime t2 = boost::posix_time::microsec_clock::local_time();
ier = cufinufftf_makeplan(1, 2, nmodes, 1, m_iSPESize * m_iCHSize, 1e-6, &m_dplan, &m_gpuopts);
ier = cufinufftf_setpts(m_dplan, m_iSamples * m_iSpokes, m_d_x, m_d_y, NULL, 0, NULL, NULL, NULL);
ier = cufinufftf_execute(m_dplan, m_d_c, m_d_fk);
ier = cufinufftf_destroy(m_dplan);
boost::posix_time::time_duration duration2 = boost::posix_time::microsec_clock::local_time() - t2;
std::cout << "cufinufftf running Elapsed time= " << duration2 << std::endl;
boost::posix_time::ptime t3 = boost::posix_time::microsec_clock::local_time();
cudaMemcpy(pImgDes, m_d_fk, m_iROSize * m_iPESize * m_iSPESize * m_iCHSize * sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
boost::posix_time::time_duration duration3 = boost::posix_time::microsec_clock::local_time() - t3;
std::cout << "cufinufftf DeviceToHost Elapsed time= " << duration3 << std::endl;
```
-
Thanks! I did not attempt to understand your code. The main facts are: what are N (i.e. N1, N2), M (# pts), and ntransforms?
From your 2d1 call it looks like ntrans=24, M = 36*3458 = 124488, N1=N2=256, eps=1e-6.
Indeed, on my 8-core Ryzen 2 laptop I get similar CPU performance to your report (testing random data):
```
test/finufft2dmany_test 24 256 256 124488 1e-6
test 2d1 many vs repeated single: ------------------------------------
ntr=24: 124488 NU pts to (256,256) modes in 0.128 s 2.34e+07 NU pts/s
one mode: rel err in F[94,66] of trans#23 is 1.06e-07
```
I just tested the GPU for the same problem, on an A6000 (roughly an RTX 3090): we actually see that method=1 (GM-sort) is twice as fast as method=2 (SM), so I recommend the former. Note that at 2.3e8 NU pts/s it is 10x the above CPU throughput (after planning):
```
./cufinufft2dmany_test 1 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time ] dummy warmup call to CUFFT 0.00114 s
[time ] cufinufft plan: 0.00991 s
[time ] cufinufft setNUpts: 0.000204 s
[time ] cufinufft exec: 0.0129 s
[time ] cufinufft destroy: 0.000194 s
[gpu ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 2.32e+04 us, speed 1.29e+08 NUpts/s
(exec-only thoughput: 2.32e+08 NU pts/s)
./cufinufft2dmany_test 2 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time ] dummy warmup call to CUFFT 0.00115 s
[time ] cufinufft plan: 0.000575 s
[time ] cufinufft setNUpts: 0.000282 s
[time ] cufinufft exec: 0.0241 s
[time ] cufinufft destroy: 0.000121 s
[gpu ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 2.5e+04 us, speed 1.19e+08 NUpts/s
(exec-only thoughput: 1.24e+08 NU pts/s)
```
So, the first task is for you to report the above tests on your hardware (e.g. build with FINUFFT_BUILD_TESTS=ON in CMake).
The second task is to see why you're not getting the GPU throughput that the tester reports. Is all your time taken in planning or setpts?
FWIW, a fancy desktop (16-core Xeon Gold 6244) gets less than twice the laptop speed:
```
OMP_NUM_THREADS=16 ./finufft2dmany_test 24 256 256 124488 1e-6
test 2d1 many vs repeated single: ------------------------------------
ntr=24: 124488 NU pts to (256,256) modes in 0.0743 s 4.02e+07 NU pts/s
one mode: rel err in F[94,66] of trans#23 is 1.15e-07
```
Sometimes with the CPU it's better to limit the threads to one per core, or even fewer.
PS: you can improve all the above timings a bit (maybe 40-70% faster) by going to single precision. I just checked the GPU and it gets about 4e8 NU pts/s in single precision. You should aim for this in your MRI application.
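For reference, "single precision" here means the f-suffixed FINUFFT interfaces with float / std::complex<float> data; a minimal sketch (not from the thread, with placeholder data):
```
// Hedged sketch: single-precision CPU type-1 2D "many" transform.
#include <complex>
#include <vector>
#include <finufft.h>

int main() {
    const int     ntr = 24;
    const int64_t M = 124488, N1 = 256, N2 = 256;
    std::vector<float> x(M, 0.5f), y(M, -0.5f);                   // NU pts in [-pi, pi)
    std::vector<std::complex<float>> c(M * ntr, {1.0f, 0.0f});    // strengths
    std::vector<std::complex<float>> F((size_t)(N1 * N2) * ntr);  // output Fourier modes
    // NULL opts -> default options
    return finufftf2d1many(ntr, M, x.data(), y.data(), c.data(),
                           +1, 1e-6f, N1, N2, F.data(), NULL);
}
```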
Upshot is you should get 5-10x over decent CPU with a decent GPU.
Does this help? Best, Alex
-
If you only want 3 digits, and you pack together 96 slices instead of 24, you can get a throughput of a billion NU pts/sec. Notice that the setup time is then negligible vs the exec time. This is what you should be seeing (ignoring H2D and D2H, which you have already measured). Let us know what you find. Best, Alex
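In terms of the cufinufft calls in the code above, that would correspond to something like the following sketch (not from the thread; it assumes the 96 slices fit in GPU memory and that the m_d_c / m_d_fk buffers are allocated for 96 slices):
```
// Hedged sketch: batch 96 slices per plan at ~3-digit accuracy.
int64_t nmodes[3] = {256, 256, 1};
int   ntransf = 96;      // 96 slices per transform instead of 24
float tol     = 1e-3f;   // "3 digits"
cufinufftf_plan plan;
cufinufftf_makeplan(1, 2, nmodes, 1, ntransf, tol, &plan, &m_gpuopts);
```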
-
Thanks a lot! It helps! Actually I got similar results from those tests; it seems the most time-consuming steps are the warmup call to CUFFT and the planning in my setup, which I did not time separately the first time. I ran those tests on the same setup mentioned above.
```
test/finufft2dmany_test 24 256 256 124488 1e-6
test 2d1 many vs repeated single: ------------------------------------
ntr=24: 124488 NU pts to (256,256) modes in 0.0617 s 4.84e+07 NU pts/s
one mode: rel err in F[94,66] of trans#23 is 1.06e-07

./cufinufft2dmany_test 1 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time ] dummy warmup call to CUFFT 0.164 s
[time ] cufinufft plan: 0.0357 s
[time ] cufinufft setNUpts: 0.000312 s
[time ] cufinufft exec: 0.0136 s
[time ] cufinufft destroy: 0.00375 s
[gpu ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 5.34e+04 us, speed 5.6e+07 NUpts/s
(exec-only thoughput: 2.19e+08 NU pts/s)

./cufinufft2dmany_test 2 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time ] dummy warmup call to CUFFT 0.165 s
[time ] cufinufft plan: 0.0355 s
[time ] cufinufft setNUpts: 0.000328 s
[time ] cufinufft exec: 0.0261 s
[time ] cufinufft destroy: 0.00375 s
[gpu ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 6.57e+04 us, speed 4.55e+07 NUpts/s
(exec-only thoughput: 1.14e+08 NU pts/s)
```
Method=1 (GM-sort) is indeed about twice as fast as method=2 (SM).
```
./cufinufft2dmany_test 1 1 256 256 24 0 124488 1e-6 1e-5 f
#modes = 65536, #inputs = 24, #NUpts = 124488
[time ] dummy warmup call to CUFFT 0.228 s
[time ] cufinufft plan: 0.0981 s
[time ] cufinufft setNUpts: 0.000168 s
[time ] cufinufft exec: 0.00743 s
[time ] cufinufft destroy: 0.00424 s
[gpu ] 23th data one mode: rel err in F[94,66] is 9.74e-06
[totaltime] 1.1e+05 us, speed 2.72e+07 NUpts/s
(exec-only thoughput: 4.02e+08 NU pts/s)
```
And single precision is about 2x faster.
```
./cufinufft2dmany_test 1 1 256 256 96 0 124488 1e-3 1e-2 f
#modes = 65536, #inputs = 96, #NUpts = 124488
[time ] dummy warmup call to CUFFT 0.229 s
[time ] cufinufft plan: 0.0981 s
[time ] cufinufft setNUpts: 0.000184 s
[time ] cufinufft exec: 0.0113 s
[time ] cufinufft destroy: 0.00431 s
[gpu ] 95th data one mode: rel err in F[94,66] is 0.00014
[totaltime] 1.14e+05 us, speed 1.05e+08 NUpts/s
(exec-only thoughput: 1.05e+09 NU pts/s)
```
When I use 96 slices instead of 24, the exec time is much the same as your results. So the main problem is the time spent in the warmup call to CUFFT and in planning, which confuses me. However, these two steps only need to be done once in my application, so it won't be a serious problem. Thanks again!
-
Yeah, it looks like your dummy (warmup) call to CUFFT is 0.2 s, which is very slow. I seem to remember old V100s having this problem. Could it be a CUDA version issue? Maybe search online re CUFFT. But in any case, just insert such a dummy call at the start of your code.
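A minimal sketch of such a warmup (not from the thread; any tiny CUFFT plan, preceded by a cudaFree(0) to create the CUDA context, will do):
```
// Hedged sketch: one-time GPU warmup at program start, so the CUDA/CUFFT
// initialization cost is not charged to the first cufinufft plan.
#include <cuda_runtime.h>
#include <cufft.h>

void warmup_gpu() {
    cudaFree(0);                      // forces CUDA context creation
    cufftHandle h;
    if (cufftPlan1d(&h, 256, CUFFT_C2C, 1) == CUFFT_SUCCESS) {
        cufftDestroy(h);              // tiny dummy plan just to load CUFFT
    }
}
```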
Your GPU plan times are also 10x longer than mine. But, as you say, you only have to plan once, since the k-space points stay fixed as you iterate (I think!).
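Concretely, that restructuring might look like the following sketch (not from the thread; error checks omitted, variable names follow the code posted earlier, and nIters stands for a hypothetical outer reconstruction loop):
```
// Hedged sketch: plan and set the (fixed) k-space points once, then only pay
// for execute inside the iteration; destroy the plan once at the end.
cufinufftf_plan plan;
cufinufftf_makeplan(1, 2, nmodes, 1, m_iSPESize * m_iCHSize, 1e-6, &plan, &m_gpuopts);
cufinufftf_setpts(plan, m_iSamples * m_iSpokes, m_d_x, m_d_y, NULL, 0, NULL, NULL, NULL);

for (int iter = 0; iter < nIters; ++iter) {
    // ... update m_d_c on the GPU (keeping the whole iteration on-device) ...
    cufinufftf_execute(plan, m_d_c, m_d_fk);   // the only per-iteration cufinufft cost
    // ... use m_d_fk on the GPU ...
}
cufinufftf_destroy(plan);
```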
Your execute times are the same as mine, as you say - great!
Let me know if you're able to achieve this speed in your MRI application (apart from H2D and D2H, which are inevitable unless you keep the whole iteration on the GPU, which is the best way!).
Cheers, Alex