diff --git a/include/zfp.h b/include/zfp.h index 9db4a3873..f92c0b436 100644 --- a/include/zfp.h +++ b/include/zfp.h @@ -12,6 +12,10 @@ #include "zfp/internal/zfp/system.h" #include "zfp/internal/zfp/types.h" +#ifdef ZFP_WITH_CUDA +#include +#endif + /* macros ------------------------------------------------------------------ */ /* default compression parameters */ @@ -133,6 +137,11 @@ typedef struct { size_t nx, ny, nz, nw; /* sizes (zero for unused dimensions) */ ptrdiff_t sx, sy, sz, sw; /* strides (zero for contiguous array a[nw][nz][ny][nx]) */ void* data; /* pointer to array data */ + +#ifdef ZFP_WITH_CUDA + cudaStream_t cuStream; /* Provision to execute in stream */ +#endif + } zfp_field; #ifdef __cplusplus @@ -580,6 +589,12 @@ zfp_field_set_metadata( uint64 meta /* compact 52-bit encoding of metadata */ ); +/* Set cuda stream in case of strem execution */ + +#ifdef ZFP_WITH_CUDA +void zfp_field_set_cuda_stream(zfp_field* field, cudaStream_t custream); +#endif + /* high-level API: compression and decompression --------------------------- */ /* compress entire field (nonzero return value upon success) */ diff --git a/src/cuda_zfp/cuZFP.cu b/src/cuda_zfp/cuZFP.cu index e1de467ff..f9bbb4ef8 100644 --- a/src/cuda_zfp/cuZFP.cu +++ b/src/cuda_zfp/cuZFP.cu @@ -98,7 +98,7 @@ bool is_contigous(const uint dims[3], const int3 &stride, long long int &offset) // encode expects device pointers // template -size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_stream) +size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_stream, cudaStream_t custream) { int d = 0; @@ -118,7 +118,7 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_ { int dim = dims[0]; int sx = stride.x; - stream_size = cuZFP::encode1(dim, sx, d_data, d_stream, bits_per_block); + stream_size = cuZFP::encode1(dim, sx, d_data, d_stream, bits_per_block, custream); } else if(d == 2) { @@ -126,7 +126,7 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_ int2 s; s.x = stride.x; s.y = stride.y; - stream_size = cuZFP::encode2(ndims, s, d_data, d_stream, bits_per_block); + stream_size = cuZFP::encode2(ndims, s, d_data, d_stream, bits_per_block, custream); } else if(d == 3) { @@ -135,7 +135,7 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_ s.y = stride.y; s.z = stride.z; uint3 ndims = make_uint3(dims[0], dims[1], dims[2]); - stream_size = cuZFP::encode(ndims, s, d_data, d_stream, bits_per_block); + stream_size = cuZFP::encode3(ndims, s, d_data, d_stream, bits_per_block, custream); } errors.chk("Encode"); @@ -144,9 +144,11 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_ } template -size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *out) +size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *out, cudaStream_t custream) { + /* Include CUDA stream in decode call */ + int d = 0; size_t out_size = 1; size_t stream_bytes = 0; @@ -168,14 +170,14 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o s.y = stride.y; s.z = stride.z; - stream_bytes = cuZFP::decode3(dims, s, stream, out, bits_per_block); + stream_bytes = cuZFP::decode3(dims, s, stream, out, bits_per_block, custream); } else if(d == 1) { uint dim = ndims[0]; int sx = stride.x; - stream_bytes = cuZFP::decode1(dim, sx, stream, out, bits_per_block); + stream_bytes = cuZFP::decode1(dim, sx, stream, out, bits_per_block, custream); } else if(d == 2) @@ -188,7 +190,7 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o s.x = stride.x; s.y = stride.y; - stream_bytes = cuZFP::decode2(dims, s, stream, out, bits_per_block); + stream_bytes = cuZFP::decode2(dims, s, stream, out, bits_per_block, custream); } else std::cerr<<" d == "<cuStream); return d_stream; } @@ -224,8 +226,11 @@ Word *setup_device_stream_decompress(zfp_stream *stream,const zfp_field *field) Word *d_stream = NULL; //TODO: change maximum_size to compressed stream size size_t size = zfp_stream_maximum_size(stream, field); - cudaMalloc(&d_stream, size); - cudaMemcpy(d_stream, stream->stream->begin, size, cudaMemcpyHostToDevice); + + /* Allocate memory per CUDA stream */ + + cudaMallocAsync(&d_stream, size, field->cuStream); + cudaMemcpyAsync(d_stream, stream->stream->begin, size, cudaMemcpyHostToDevice, field->cuStream); return d_stream; } @@ -289,9 +294,11 @@ void *setup_device_field_compress(const zfp_field *field, const int3 &stride, lo if(contig) { size_t field_bytes = type_size * field_size; - cudaMalloc(&d_data, field_bytes); + /* allocate memory async per stream */ - cudaMemcpy(d_data, host_ptr, field_bytes, cudaMemcpyHostToDevice); + cudaMallocAsync(&d_data, field_bytes, field->cuStream); + + cudaMemcpyAsync(d_data, host_ptr, field_bytes, cudaMemcpyHostToDevice, field->cuStream); } return offset_void(field->type, d_data, -offset); } @@ -328,12 +335,16 @@ void *setup_device_field_decompress(const zfp_field *field, const int3 &stride, if(contig) { size_t field_bytes = type_size * field_size; - cudaMalloc(&d_data, field_bytes); + /* Allocate GPU memory per CUDA stream */ + + cudaMallocAsync(&d_data, field_bytes, field->cuStream); } return offset_void(field->type, d_data, -offset); } -void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int offset, zfp_type type) +/* CUDA stream is assigned in the device cleanup */ + +void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int offset, zfp_type type, cudaStream_t custream) { bool device = cuZFP::is_gpu_ptr(orig_ptr); if(device) @@ -346,10 +357,10 @@ void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int if(bytes > 0) { - cudaMemcpy(h_offset_ptr, d_offset_ptr, bytes, cudaMemcpyDeviceToHost); + cudaMemcpyAsync(h_offset_ptr, d_offset_ptr, bytes, cudaMemcpyDeviceToHost, custream); } - cudaFree(d_offset_ptr); + cudaFreeAsync(d_offset_ptr, custream); } } // namespace internal @@ -366,6 +377,10 @@ cuda_compress(zfp_stream *stream, const zfp_field *field) stride.x = field->sx ? field->sx : 1; stride.y = field->sy ? field->sy : field->nx; stride.z = field->sz ? field->sz : field->nx * field->ny; + + /* CUDA stream implementation */ + + cudaStream_t cudastream = field->cuStream; size_t stream_bytes = 0; long long int offset = 0; @@ -382,26 +397,26 @@ cuda_compress(zfp_stream *stream, const zfp_field *field) if(field->type == zfp_type_float) { float* data = (float*) d_data; - stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream); + stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream, cudastream); } else if(field->type == zfp_type_double) { double* data = (double*) d_data; - stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream); + stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream, cudastream); } else if(field->type == zfp_type_int32) { int * data = (int*) d_data; - stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream); + stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream, cudastream); } else if(field->type == zfp_type_int64) { long long int * data = (long long int*) d_data; - stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream); + stream_bytes = internal::encode(dims, stride, (int)stream->maxbits, data, d_stream, cudastream); } - internal::cleanup_device_ptr(stream->stream->begin, d_stream, stream_bytes, 0, field->type); - internal::cleanup_device_ptr(field->data, d_data, 0, offset, field->type); + internal::cleanup_device_ptr(stream->stream->begin, d_stream, stream_bytes, 0, field->type, cudastream); + internal::cleanup_device_ptr(field->data, d_data, 0, offset, field->type, cudastream); // zfp wants to flush the stream. // set bits to wsize because we already did that. @@ -436,30 +451,34 @@ cuda_decompress(zfp_stream *stream, zfp_field *field) return; } + /* Include CUDA Stream */ + + cudaStream_t cudastream = field->cuStream; + Word *d_stream = internal::setup_device_stream_decompress(stream, field); if(field->type == zfp_type_float) { float *data = (float*) d_data; - decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream); d_data = (void*) data; } else if(field->type == zfp_type_double) { double *data = (double*) d_data; - decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream); d_data = (void*) data; } else if(field->type == zfp_type_int32) { int *data = (int*) d_data; - decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream); d_data = (void*) data; } else if(field->type == zfp_type_int64) { long long int *data = (long long int*) d_data; - decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream); d_data = (void*) data; } else @@ -480,8 +499,8 @@ cuda_decompress(zfp_stream *stream, zfp_field *field) } size_t bytes = type_size * field_size; - internal::cleanup_device_ptr(stream->stream->begin, d_stream, 0, 0, field->type); - internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type); + internal::cleanup_device_ptr(stream->stream->begin, d_stream, 0, 0, field->type, cudastream); + internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type, cudastream); // this is how zfp determines if this was a success size_t words_read = decoded_bytes / sizeof(Word); diff --git a/src/cuda_zfp/decode1.cuh b/src/cuda_zfp/decode1.cuh index 6d357f631..5246471d4 100644 --- a/src/cuda_zfp/decode1.cuh +++ b/src/cuda_zfp/decode1.cuh @@ -81,7 +81,8 @@ size_t decode1launch(uint dim, int stride, Word *stream, Scalar *d_data, - uint maxbits) + uint maxbits, + cudaStream_t custream) { const int cuda_block_size = 128; @@ -110,10 +111,10 @@ size_t decode1launch(uint dim, cudaEventCreate(&start); cudaEventCreate(&stop); - cudaEventRecord(start); + cudaEventRecord(start, custream); #endif - cudaDecode1 << < grid_size, block_size >> > + cudaDecode1 << < grid_size, block_size, 0, custream >> > (stream, d_data, dim, @@ -123,9 +124,9 @@ size_t decode1launch(uint dim, maxbits); #ifdef CUDA_ZFP_RATE_PRINT - cudaEventRecord(stop); + cudaEventRecord(stop, custream); cudaEventSynchronize(stop); - cudaStreamSynchronize(0); + cudaStreamSynchronize(custream); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); @@ -145,9 +146,10 @@ size_t decode1(int dim, int stride, Word *stream, Scalar *d_data, - uint maxbits) + uint maxbits, + cudaStream_t custream) { - return decode1launch(dim, stride, stream, d_data, maxbits); + return decode1launch(dim, stride, stream, d_data, maxbits, custream); } } // namespace cuZFP diff --git a/src/cuda_zfp/decode2.cuh b/src/cuda_zfp/decode2.cuh index fa60a82f7..c51fcee30 100644 --- a/src/cuda_zfp/decode2.cuh +++ b/src/cuda_zfp/decode2.cuh @@ -99,7 +99,8 @@ size_t decode2launch(uint2 dims, int2 stride, Word *stream, Scalar *d_data, - uint maxbits) + uint maxbits, + cudaStream_t custream) { const int cuda_block_size = 128; dim3 block_size; @@ -134,10 +135,10 @@ size_t decode2launch(uint2 dims, cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaEventRecord(start); + cudaEventRecord(start, custream); #endif - cudaDecode2 << < grid_size, block_size >> > + cudaDecode2 << < grid_size, block_size, 0, custream >> > (stream, d_data, dims, @@ -146,9 +147,9 @@ size_t decode2launch(uint2 dims, maxbits); #ifdef CUDA_ZFP_RATE_PRINT - cudaEventRecord(stop); + cudaEventRecord(stop, custream); cudaEventSynchronize(stop); - cudaStreamSynchronize(0); + cudaStreamSynchronize(custream); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); @@ -168,9 +169,10 @@ size_t decode2(uint2 dims, int2 stride, Word *stream, Scalar *d_data, - uint maxbits) + uint maxbits, + cudaStream_t custream) { - return decode2launch(dims, stride, stream, d_data, maxbits); + return decode2launch(dims, stride, stream, d_data, maxbits, custream); } } // namespace cuZFP diff --git a/src/cuda_zfp/decode3.cuh b/src/cuda_zfp/decode3.cuh index 9f2a98a89..1b4264d6f 100644 --- a/src/cuda_zfp/decode3.cuh +++ b/src/cuda_zfp/decode3.cuh @@ -110,7 +110,8 @@ size_t decode3launch(uint3 dims, int3 stride, Word *stream, Scalar *d_data, - uint maxbits) + uint maxbits, + cudaStream_t custream) { const int cuda_block_size = 128; dim3 block_size; @@ -147,10 +148,10 @@ size_t decode3launch(uint3 dims, cudaEventCreate(&start); cudaEventCreate(&stop); - cudaEventRecord(start); + cudaEventRecord(start, custream); #endif - cudaDecode3 << < grid_size, block_size >> > + cudaDecode3 <<< grid_size, block_size, 0, custream >>> (stream, d_data, dims, @@ -159,9 +160,9 @@ size_t decode3launch(uint3 dims, maxbits); #ifdef CUDA_ZFP_RATE_PRINT - cudaEventRecord(stop); + cudaEventRecord(stop, custream); cudaEventSynchronize(stop); - cudaStreamSynchronize(0); + cudaStreamSynchronize(custream); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); @@ -182,9 +183,10 @@ size_t decode3(uint3 dims, int3 stride, Word *stream, Scalar *d_data, - uint maxbits) + uint maxbits, + cudaStream_t custream) { - return decode3launch(dims, stride, stream, d_data, maxbits); + return decode3launch(dims, stride, stream, d_data, maxbits, custream); } } // namespace cuZFP diff --git a/src/cuda_zfp/encode1.cuh b/src/cuda_zfp/encode1.cuh index 98ce5a753..6e8ae70af 100644 --- a/src/cuda_zfp/encode1.cuh +++ b/src/cuda_zfp/encode1.cuh @@ -95,7 +95,8 @@ size_t encode1launch(uint dim, int sx, const Scalar *d_data, Word *stream, - const int maxbits) + const int maxbits, + cudaStream_t custream) { const int cuda_block_size = 128; dim3 block_size = dim3(cuda_block_size, 1, 1); @@ -121,17 +122,17 @@ size_t encode1launch(uint dim, // size_t stream_bytes = calc_device_mem1d(zfp_pad, maxbits); // ensure we have zeros - cudaMemset(stream, 0, stream_bytes); + cudaMemsetAsync(stream, 0, stream_bytes, custream); #ifdef CUDA_ZFP_RATE_PRINT cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaEventRecord(start); + cudaEventRecord(start, custream); #endif - cudaEncode1 <<>> + cudaEncode1 <<>> (maxbits, d_data, stream, @@ -141,9 +142,9 @@ size_t encode1launch(uint dim, zfp_blocks); #ifdef CUDA_ZFP_RATE_PRINT - cudaEventRecord(stop); + cudaEventRecord(stop, custream); cudaEventSynchronize(stop); - cudaStreamSynchronize(0); + cudaStreamSynchronize(custream); float milliseconds = 0.f; cudaEventElapsedTime(&milliseconds, start, stop); @@ -164,9 +165,10 @@ size_t encode1(int dim, int sx, Scalar *d_data, Word *stream, - const int maxbits) + const int maxbits, + cudaStream_t custream) { - return encode1launch(dim, sx, d_data, stream, maxbits); + return encode1launch(dim, sx, d_data, stream, maxbits, custream); } } diff --git a/src/cuda_zfp/encode2.cuh b/src/cuda_zfp/encode2.cuh index 0d577d51e..3c3a96fbf 100644 --- a/src/cuda_zfp/encode2.cuh +++ b/src/cuda_zfp/encode2.cuh @@ -111,7 +111,8 @@ size_t encode2launch(uint2 dims, int2 stride, const Scalar *d_data, Word *stream, - const int maxbits) + const int maxbits, + cudaStream_t custream) { const int cuda_block_size = 128; dim3 block_size = dim3(cuda_block_size, 1, 1); @@ -139,16 +140,16 @@ size_t encode2launch(uint2 dims, // size_t stream_bytes = calc_device_mem2d(zfp_pad, maxbits); // ensure we have zeros - cudaMemset(stream, 0, stream_bytes); + cudaMemsetAsync(stream, 0, stream_bytes, custream); #ifdef CUDA_ZFP_RATE_PRINT cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaEventRecord(start); + cudaEventRecord(start, custream); #endif - cudaEncode2 <<>> + cudaEncode2 <<>> (maxbits, d_data, stream, @@ -158,10 +159,11 @@ size_t encode2launch(uint2 dims, zfp_blocks); #ifdef CUDA_ZFP_RATE_PRINT - cudaDeviceSynchronize(); - cudaEventRecord(stop); + // Check if the following Device Synchronize is necessary? + // cudaDeviceSynchronize(); + cudaEventRecord(stop, custream); cudaEventSynchronize(stop); - cudaStreamSynchronize(0); + cudaStreamSynchronize(custream); float milliseconds = 0.f; cudaEventElapsedTime(&milliseconds, start, stop); @@ -179,9 +181,10 @@ size_t encode2(uint2 dims, int2 stride, Scalar *d_data, Word *stream, - const int maxbits) + const int maxbits, + cudaStream_t custream) { - return encode2launch(dims, stride, d_data, stream, maxbits); + return encode2launch(dims, stride, d_data, stream, maxbits, custream); } } diff --git a/src/cuda_zfp/encode3.cuh b/src/cuda_zfp/encode3.cuh index 1edee9e99..8a507c67e 100644 --- a/src/cuda_zfp/encode3.cuh +++ b/src/cuda_zfp/encode3.cuh @@ -119,7 +119,8 @@ size_t encode3launch(uint3 dims, int3 stride, const Scalar *d_data, Word *stream, - const int maxbits) + const int maxbits, + cudaStream_t custream) { const int cuda_block_size = 128; @@ -148,16 +149,16 @@ size_t encode3launch(uint3 dims, size_t stream_bytes = calc_device_mem3d(zfp_pad, maxbits); //ensure we start with 0s - cudaMemset(stream, 0, stream_bytes); + cudaMemsetAsync(stream, 0, stream_bytes, custream); #ifdef CUDA_ZFP_RATE_PRINT cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaEventRecord(start); + cudaEventRecord(start, custream); #endif - cudaEncode <<>> + cudaEncode <<>> (maxbits, d_data, stream, @@ -167,9 +168,9 @@ size_t encode3launch(uint3 dims, zfp_blocks); #ifdef CUDA_ZFP_RATE_PRINT - cudaEventRecord(stop); + cudaEventRecord(stop, custream); cudaEventSynchronize(stop); - cudaStreamSynchronize(0); + cudaStreamSynchronize(custream); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); @@ -187,14 +188,17 @@ size_t encode3launch(uint3 dims, // // Just pass the raw pointer to the "real" encode // +// Modifiled - Function name should be encode3, as it is followed in 1, 2 - D cases + template -size_t encode(uint3 dims, +size_t encode3(uint3 dims, int3 stride, Scalar *d_data, Word *stream, - const int bits_per_block) + const int bits_per_block, + cudaStream_t custream) { - return encode3launch(dims, stride, d_data, stream, bits_per_block); + return encode3launch(dims, stride, d_data, stream, bits_per_block, custream); } } diff --git a/src/zfp.c b/src/zfp.c index a498a985f..d46bdb452 100644 --- a/src/zfp.c +++ b/src/zfp.c @@ -113,6 +113,10 @@ zfp_field_alloc() field->nx = field->ny = field->nz = field->nw = 0; field->sx = field->sy = field->sz = field->sw = 0; field->data = 0; + +#ifdef ZFP_WITH_CUDA + field->cuStream = 0; /* set to default stream */ +#endif } return field; } @@ -335,6 +339,14 @@ zfp_field_set_pointer(zfp_field* field, void* data) field->data = data; } +#ifdef ZFP_WITH_CUDA +void +zfp_field_set_cuda_stream(zfp_field* field, cudaStream_t custream) +{ + field->cuStream = custream; +} +#endif + zfp_type zfp_field_set_type(zfp_field* field, zfp_type type) {