diff --git a/include/zfp.h b/include/zfp.h
index 9db4a3873..f92c0b436 100644
--- a/include/zfp.h
+++ b/include/zfp.h
@@ -12,6 +12,10 @@
 #include "zfp/internal/zfp/system.h"
 #include "zfp/internal/zfp/types.h"
 
+#ifdef ZFP_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
+
 /* macros ------------------------------------------------------------------ */
 
 /* default compression parameters */
@@ -133,6 +137,11 @@ typedef struct {
   size_t nx, ny, nz, nw;    /* sizes (zero for unused dimensions) */
   ptrdiff_t sx, sy, sz, sw; /* strides (zero for contiguous array a[nw][nz][ny][nx]) */
   void* data;               /* pointer to array data */
+
+#ifdef ZFP_WITH_CUDA
+  cudaStream_t cuStream; /* Provision to execute in stream */
+#endif
+
 } zfp_field;
 
 #ifdef __cplusplus
@@ -580,6 +589,12 @@ zfp_field_set_metadata(
   uint64 meta       /* compact 52-bit encoding of metadata */
 );
 
+/* Set cuda stream in case of strem execution */
+
+#ifdef ZFP_WITH_CUDA
+void zfp_field_set_cuda_stream(zfp_field* field, cudaStream_t custream);
+#endif
+
 /* high-level API: compression and decompression --------------------------- */
 
 /* compress entire field (nonzero return value upon success) */
diff --git a/src/cuda_zfp/cuZFP.cu b/src/cuda_zfp/cuZFP.cu
index e1de467ff..f9bbb4ef8 100644
--- a/src/cuda_zfp/cuZFP.cu
+++ b/src/cuda_zfp/cuZFP.cu
@@ -98,7 +98,7 @@ bool is_contigous(const uint dims[3], const int3 &stride, long long int &offset)
 // encode expects device pointers
 //
 template<typename T>
-size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_stream)
+size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_stream, cudaStream_t custream)
 {
 
   int d = 0;
@@ -118,7 +118,7 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_
   {
     int dim = dims[0];
     int sx = stride.x;
-    stream_size = cuZFP::encode1<T>(dim, sx, d_data, d_stream, bits_per_block); 
+    stream_size = cuZFP::encode1<T>(dim, sx, d_data, d_stream, bits_per_block, custream); 
   }
   else if(d == 2)
   {
@@ -126,7 +126,7 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_
     int2 s;
     s.x = stride.x; 
     s.y = stride.y; 
-    stream_size = cuZFP::encode2<T>(ndims, s, d_data, d_stream, bits_per_block); 
+    stream_size = cuZFP::encode2<T>(ndims, s, d_data, d_stream, bits_per_block, custream); 
   }
   else if(d == 3)
   {
@@ -135,7 +135,7 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_
     s.y = stride.y; 
     s.z = stride.z; 
     uint3 ndims = make_uint3(dims[0], dims[1], dims[2]);
-    stream_size = cuZFP::encode<T>(ndims, s, d_data, d_stream, bits_per_block); 
+    stream_size = cuZFP::encode3<T>(ndims, s, d_data, d_stream, bits_per_block, custream); 
   }
 
   errors.chk("Encode");
@@ -144,9 +144,11 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_
 }
 
 template<typename T>
-size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *out)
+size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *out, cudaStream_t custream)
 {
 
+  /* Include CUDA stream in decode call */
+
   int d = 0;
   size_t out_size = 1;
   size_t stream_bytes = 0;
@@ -168,14 +170,14 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o
     s.y = stride.y; 
     s.z = stride.z; 
 
-    stream_bytes = cuZFP::decode3<T>(dims, s, stream, out, bits_per_block); 
+    stream_bytes = cuZFP::decode3<T>(dims, s, stream, out, bits_per_block, custream); 
   }
   else if(d == 1)
   {
     uint dim = ndims[0];
     int sx = stride.x;
 
-    stream_bytes = cuZFP::decode1<T>(dim, sx, stream, out, bits_per_block); 
+    stream_bytes = cuZFP::decode1<T>(dim, sx, stream, out, bits_per_block, custream); 
 
   }
   else if(d == 2)
@@ -188,7 +190,7 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o
     s.x = stride.x; 
     s.y = stride.y; 
 
-    stream_bytes = cuZFP::decode2<T>(dims, s, stream, out, bits_per_block); 
+    stream_bytes = cuZFP::decode2<T>(dims, s, stream, out, bits_per_block, custream); 
   }
   else std::cerr<<" d ==  "<<d<<" not implemented\n";
  
@@ -207,7 +209,7 @@ Word *setup_device_stream_compress(zfp_stream *stream,const zfp_field *field)
 
   Word *d_stream = NULL;
   size_t max_size = zfp_stream_maximum_size(stream, field);
-  cudaMalloc(&d_stream, max_size);
+  cudaMallocAsync(&d_stream, max_size, field->cuStream);
   return d_stream;
 }
 
@@ -224,8 +226,11 @@ Word *setup_device_stream_decompress(zfp_stream *stream,const zfp_field *field)
   Word *d_stream = NULL;
   //TODO: change maximum_size to compressed stream size
   size_t size = zfp_stream_maximum_size(stream, field);
-  cudaMalloc(&d_stream, size);
-  cudaMemcpy(d_stream, stream->stream->begin, size, cudaMemcpyHostToDevice);
+
+  /* Allocate memory per CUDA stream */
+
+  cudaMallocAsync(&d_stream, size, field->cuStream);
+  cudaMemcpyAsync(d_stream, stream->stream->begin, size, cudaMemcpyHostToDevice, field->cuStream);
   return d_stream;
 }
 
@@ -289,9 +294,11 @@ void *setup_device_field_compress(const zfp_field *field, const int3 &stride, lo
   if(contig)
   {
     size_t field_bytes = type_size * field_size;
-    cudaMalloc(&d_data, field_bytes);
+    /* allocate memory async per stream */
 
-    cudaMemcpy(d_data, host_ptr, field_bytes, cudaMemcpyHostToDevice);
+    cudaMallocAsync(&d_data, field_bytes, field->cuStream);
+
+    cudaMemcpyAsync(d_data, host_ptr, field_bytes, cudaMemcpyHostToDevice, field->cuStream);
   }
   return offset_void(field->type, d_data, -offset);
 }
@@ -328,12 +335,16 @@ void *setup_device_field_decompress(const zfp_field *field, const int3 &stride,
   if(contig)
   {
     size_t field_bytes = type_size * field_size;
-    cudaMalloc(&d_data, field_bytes);
+    /* Allocate GPU memory per CUDA stream */
+
+    cudaMallocAsync(&d_data, field_bytes, field->cuStream);
   }
   return offset_void(field->type, d_data, -offset);
 }
 
-void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int offset, zfp_type type)
+/* CUDA stream is assigned in the device cleanup */
+
+void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int offset, zfp_type type, cudaStream_t custream)
 {
   bool device = cuZFP::is_gpu_ptr(orig_ptr);
   if(device)
@@ -346,10 +357,10 @@ void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int
 
   if(bytes > 0)
   {
-    cudaMemcpy(h_offset_ptr, d_offset_ptr, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpyAsync(h_offset_ptr, d_offset_ptr, bytes, cudaMemcpyDeviceToHost, custream);
   }
 
-  cudaFree(d_offset_ptr);
+  cudaFreeAsync(d_offset_ptr, custream);
 }
 
 } // namespace internal
@@ -366,6 +377,10 @@ cuda_compress(zfp_stream *stream, const zfp_field *field)
   stride.x = field->sx ? field->sx : 1;
   stride.y = field->sy ? field->sy : field->nx;
   stride.z = field->sz ? field->sz : field->nx * field->ny;
+
+  /* CUDA stream implementation */
+
+  cudaStream_t cudastream = field->cuStream;
   
   size_t stream_bytes = 0;
   long long int offset = 0; 
@@ -382,26 +397,26 @@ cuda_compress(zfp_stream *stream, const zfp_field *field)
   if(field->type == zfp_type_float)
   {
     float* data = (float*) d_data;
-    stream_bytes = internal::encode<float>(dims, stride, (int)stream->maxbits, data, d_stream);
+    stream_bytes = internal::encode<float>(dims, stride, (int)stream->maxbits, data, d_stream, cudastream);
   }
   else if(field->type == zfp_type_double)
   {
     double* data = (double*) d_data;
-    stream_bytes = internal::encode<double>(dims, stride, (int)stream->maxbits, data, d_stream);
+    stream_bytes = internal::encode<double>(dims, stride, (int)stream->maxbits, data, d_stream, cudastream);
   }
   else if(field->type == zfp_type_int32)
   {
     int * data = (int*) d_data;
-    stream_bytes = internal::encode<int>(dims, stride, (int)stream->maxbits, data, d_stream);
+    stream_bytes = internal::encode<int>(dims, stride, (int)stream->maxbits, data, d_stream, cudastream);
   }
   else if(field->type == zfp_type_int64)
   {
     long long int * data = (long long int*) d_data;
-    stream_bytes = internal::encode<long long int>(dims, stride, (int)stream->maxbits, data, d_stream);
+    stream_bytes = internal::encode<long long int>(dims, stride, (int)stream->maxbits, data, d_stream, cudastream);
   }
 
-  internal::cleanup_device_ptr(stream->stream->begin, d_stream, stream_bytes, 0, field->type);
-  internal::cleanup_device_ptr(field->data, d_data, 0, offset, field->type);
+  internal::cleanup_device_ptr(stream->stream->begin, d_stream, stream_bytes, 0, field->type, cudastream);
+  internal::cleanup_device_ptr(field->data, d_data, 0, offset, field->type, cudastream);
 
   // zfp wants to flush the stream.
   // set bits to wsize because we already did that.
@@ -436,30 +451,34 @@ cuda_decompress(zfp_stream *stream, zfp_field *field)
     return;
   }
 
+  /* Include CUDA Stream */
+
+  cudaStream_t cudastream = field->cuStream;
+
   Word *d_stream = internal::setup_device_stream_decompress(stream, field);
 
   if(field->type == zfp_type_float)
   {
     float *data = (float*) d_data;
-    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream);
     d_data = (void*) data;
   }
   else if(field->type == zfp_type_double)
   {
     double *data = (double*) d_data;
-    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream);
     d_data = (void*) data;
   }
   else if(field->type == zfp_type_int32)
   {
     int *data = (int*) d_data;
-    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream);
     d_data = (void*) data;
   }
   else if(field->type == zfp_type_int64)
   {
     long long int *data = (long long int*) d_data;
-    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data, cudastream);
     d_data = (void*) data;
   }
   else
@@ -480,8 +499,8 @@ cuda_decompress(zfp_stream *stream, zfp_field *field)
   }
   
   size_t bytes = type_size * field_size;
-  internal::cleanup_device_ptr(stream->stream->begin, d_stream, 0, 0, field->type);
-  internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type);
+  internal::cleanup_device_ptr(stream->stream->begin, d_stream, 0, 0, field->type, cudastream);
+  internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type, cudastream);
   
   // this is how zfp determines if this was a success
   size_t words_read = decoded_bytes / sizeof(Word);
diff --git a/src/cuda_zfp/decode1.cuh b/src/cuda_zfp/decode1.cuh
index 6d357f631..5246471d4 100644
--- a/src/cuda_zfp/decode1.cuh
+++ b/src/cuda_zfp/decode1.cuh
@@ -81,7 +81,8 @@ size_t decode1launch(uint dim,
                      int stride,
                      Word *stream,
                      Scalar *d_data,
-                     uint maxbits)
+                     uint maxbits,
+                     cudaStream_t custream)
 {
   const int cuda_block_size = 128;
 
@@ -110,10 +111,10 @@ size_t decode1launch(uint dim,
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
-  cudaEventRecord(start);
+  cudaEventRecord(start, custream);
 #endif
 
-  cudaDecode1<Scalar> << < grid_size, block_size >> >
+  cudaDecode1<Scalar> << < grid_size, block_size, 0, custream >> >
     (stream,
 		 d_data,
      dim,
@@ -123,9 +124,9 @@ size_t decode1launch(uint dim,
      maxbits);
 
 #ifdef CUDA_ZFP_RATE_PRINT
-  cudaEventRecord(stop);
+  cudaEventRecord(stop, custream);
   cudaEventSynchronize(stop);
-	cudaStreamSynchronize(0);
+	cudaStreamSynchronize(custream);
 
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);
@@ -145,9 +146,10 @@ size_t decode1(int dim,
                int stride,
                Word *stream,
                Scalar *d_data,
-               uint maxbits)
+               uint maxbits,
+               cudaStream_t custream)
 {
-	return decode1launch<Scalar>(dim, stride, stream, d_data, maxbits);
+	return decode1launch<Scalar>(dim, stride, stream, d_data, maxbits, custream);
 }
 
 } // namespace cuZFP
diff --git a/src/cuda_zfp/decode2.cuh b/src/cuda_zfp/decode2.cuh
index fa60a82f7..c51fcee30 100644
--- a/src/cuda_zfp/decode2.cuh
+++ b/src/cuda_zfp/decode2.cuh
@@ -99,7 +99,8 @@ size_t decode2launch(uint2 dims,
                      int2 stride,
                      Word *stream,
                      Scalar *d_data,
-                     uint maxbits)
+                     uint maxbits,
+                     cudaStream_t custream)
 {
   const int cuda_block_size = 128;
   dim3 block_size;
@@ -134,10 +135,10 @@ size_t decode2launch(uint2 dims,
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
-  cudaEventRecord(start);
+  cudaEventRecord(start, custream);
 #endif
 
-  cudaDecode2<Scalar, 16> << < grid_size, block_size >> >
+  cudaDecode2<Scalar, 16> << < grid_size, block_size, 0, custream >> >
     (stream,
 		 d_data,
      dims,
@@ -146,9 +147,9 @@ size_t decode2launch(uint2 dims,
      maxbits);
 
 #ifdef CUDA_ZFP_RATE_PRINT
-  cudaEventRecord(stop);
+  cudaEventRecord(stop, custream);
   cudaEventSynchronize(stop);
-	cudaStreamSynchronize(0);
+	cudaStreamSynchronize(custream);
 
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);
@@ -168,9 +169,10 @@ size_t decode2(uint2 dims,
                int2 stride,
                Word *stream,
                Scalar *d_data,
-               uint maxbits)
+               uint maxbits,
+               cudaStream_t custream)
 {
-	return decode2launch<Scalar>(dims, stride, stream, d_data, maxbits);
+	return decode2launch<Scalar>(dims, stride, stream, d_data, maxbits, custream);
 }
 
 } // namespace cuZFP
diff --git a/src/cuda_zfp/decode3.cuh b/src/cuda_zfp/decode3.cuh
index 9f2a98a89..1b4264d6f 100644
--- a/src/cuda_zfp/decode3.cuh
+++ b/src/cuda_zfp/decode3.cuh
@@ -110,7 +110,8 @@ size_t decode3launch(uint3 dims,
                      int3 stride,
                      Word *stream,
                      Scalar *d_data,
-                     uint maxbits)
+                     uint maxbits,
+                     cudaStream_t custream)
 {
   const int cuda_block_size = 128;
   dim3 block_size;
@@ -147,10 +148,10 @@ size_t decode3launch(uint3 dims,
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
-  cudaEventRecord(start);
+  cudaEventRecord(start, custream);
 #endif
 
-  cudaDecode3<Scalar, 64> << < grid_size, block_size >> >
+  cudaDecode3<Scalar, 64> <<< grid_size, block_size, 0, custream >>>
     (stream,
 		 d_data,
      dims,
@@ -159,9 +160,9 @@ size_t decode3launch(uint3 dims,
      maxbits);
 
 #ifdef CUDA_ZFP_RATE_PRINT
-  cudaEventRecord(stop);
+  cudaEventRecord(stop, custream);
   cudaEventSynchronize(stop);
-	cudaStreamSynchronize(0);
+	cudaStreamSynchronize(custream);
 
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);
@@ -182,9 +183,10 @@ size_t decode3(uint3 dims,
                int3 stride,
                Word  *stream,
                Scalar *d_data,
-               uint maxbits)
+               uint maxbits,
+               cudaStream_t custream)
 {
-	return decode3launch<Scalar>(dims, stride, stream, d_data, maxbits);
+	return decode3launch<Scalar>(dims, stride, stream, d_data, maxbits, custream);
 }
 
 } // namespace cuZFP
diff --git a/src/cuda_zfp/encode1.cuh b/src/cuda_zfp/encode1.cuh
index 98ce5a753..6e8ae70af 100644
--- a/src/cuda_zfp/encode1.cuh
+++ b/src/cuda_zfp/encode1.cuh
@@ -95,7 +95,8 @@ size_t encode1launch(uint dim,
                      int sx,
                      const Scalar *d_data,
                      Word *stream,
-                     const int maxbits)
+                     const int maxbits,
+                     cudaStream_t custream)
 {
   const int cuda_block_size = 128;
   dim3 block_size = dim3(cuda_block_size, 1, 1);
@@ -121,17 +122,17 @@ size_t encode1launch(uint dim,
   //
   size_t stream_bytes = calc_device_mem1d(zfp_pad, maxbits);
   // ensure we have zeros
-  cudaMemset(stream, 0, stream_bytes);
+  cudaMemsetAsync(stream, 0, stream_bytes, custream);
 
 #ifdef CUDA_ZFP_RATE_PRINT
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
-  cudaEventRecord(start);
+  cudaEventRecord(start, custream);
 #endif
 
-  cudaEncode1<Scalar> <<<grid_size, block_size>>>
+  cudaEncode1<Scalar> <<<grid_size, block_size, 0, custream>>>
     (maxbits,
      d_data,
      stream,
@@ -141,9 +142,9 @@ size_t encode1launch(uint dim,
      zfp_blocks);
 
 #ifdef CUDA_ZFP_RATE_PRINT
-  cudaEventRecord(stop);
+  cudaEventRecord(stop, custream);
   cudaEventSynchronize(stop);
-  cudaStreamSynchronize(0);
+  cudaStreamSynchronize(custream);
 
   float milliseconds = 0.f;
   cudaEventElapsedTime(&milliseconds, start, stop);
@@ -164,9 +165,10 @@ size_t encode1(int dim,
                int sx,
                Scalar *d_data,
                Word *stream,
-               const int maxbits)
+               const int maxbits,
+               cudaStream_t custream)
 {
-  return encode1launch<Scalar>(dim, sx, d_data, stream, maxbits);
+  return encode1launch<Scalar>(dim, sx, d_data, stream, maxbits, custream);
 }
 
 }
diff --git a/src/cuda_zfp/encode2.cuh b/src/cuda_zfp/encode2.cuh
index 0d577d51e..3c3a96fbf 100644
--- a/src/cuda_zfp/encode2.cuh
+++ b/src/cuda_zfp/encode2.cuh
@@ -111,7 +111,8 @@ size_t encode2launch(uint2 dims,
                      int2 stride,
                      const Scalar *d_data,
                      Word *stream,
-                     const int maxbits)
+                     const int maxbits,
+                     cudaStream_t custream)
 {
   const int cuda_block_size = 128;
   dim3 block_size = dim3(cuda_block_size, 1, 1);
@@ -139,16 +140,16 @@ size_t encode2launch(uint2 dims,
   //
   size_t stream_bytes = calc_device_mem2d(zfp_pad, maxbits);
   // ensure we have zeros
-  cudaMemset(stream, 0, stream_bytes);
+  cudaMemsetAsync(stream, 0, stream_bytes, custream);
 
 #ifdef CUDA_ZFP_RATE_PRINT
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
-  cudaEventRecord(start);
+  cudaEventRecord(start, custream);
 #endif
 
-  cudaEncode2<Scalar> <<<grid_size, block_size>>>
+  cudaEncode2<Scalar> <<<grid_size, block_size, 0, custream>>>
     (maxbits,
      d_data,
      stream,
@@ -158,10 +159,11 @@ size_t encode2launch(uint2 dims,
      zfp_blocks);
 
 #ifdef CUDA_ZFP_RATE_PRINT
-  cudaDeviceSynchronize();
-  cudaEventRecord(stop);
+  // Check if the following Device Synchronize is necessary? 
+  // cudaDeviceSynchronize(); 
+  cudaEventRecord(stop, custream);
   cudaEventSynchronize(stop);
-  cudaStreamSynchronize(0);
+  cudaStreamSynchronize(custream);
 
   float milliseconds = 0.f;
   cudaEventElapsedTime(&milliseconds, start, stop);
@@ -179,9 +181,10 @@ size_t encode2(uint2 dims,
                int2 stride,
                Scalar *d_data,
                Word *stream,
-               const int maxbits)
+               const int maxbits,
+               cudaStream_t custream)
 {
-  return encode2launch<Scalar>(dims, stride, d_data, stream, maxbits);
+  return encode2launch<Scalar>(dims, stride, d_data, stream, maxbits, custream);
 }
 
 }
diff --git a/src/cuda_zfp/encode3.cuh b/src/cuda_zfp/encode3.cuh
index 1edee9e99..8a507c67e 100644
--- a/src/cuda_zfp/encode3.cuh
+++ b/src/cuda_zfp/encode3.cuh
@@ -119,7 +119,8 @@ size_t encode3launch(uint3 dims,
                      int3 stride,
                      const Scalar *d_data,
                      Word *stream,
-                     const int maxbits)
+                     const int maxbits,
+                     cudaStream_t custream)
 {
 
   const int cuda_block_size = 128;
@@ -148,16 +149,16 @@ size_t encode3launch(uint3 dims,
 
   size_t stream_bytes = calc_device_mem3d(zfp_pad, maxbits);
   //ensure we start with 0s
-  cudaMemset(stream, 0, stream_bytes);
+  cudaMemsetAsync(stream, 0, stream_bytes, custream);
 
 #ifdef CUDA_ZFP_RATE_PRINT
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
-  cudaEventRecord(start);
+  cudaEventRecord(start, custream);
 #endif
 
-  cudaEncode<Scalar> <<<grid_size, block_size>>>
+  cudaEncode<Scalar> <<<grid_size, block_size, 0, custream>>>
     (maxbits,
      d_data,
      stream,
@@ -167,9 +168,9 @@ size_t encode3launch(uint3 dims,
      zfp_blocks);
 
 #ifdef CUDA_ZFP_RATE_PRINT
-  cudaEventRecord(stop);
+  cudaEventRecord(stop, custream);
   cudaEventSynchronize(stop);
-  cudaStreamSynchronize(0);
+  cudaStreamSynchronize(custream);
 
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);
@@ -187,14 +188,17 @@ size_t encode3launch(uint3 dims,
 //
 // Just pass the raw pointer to the "real" encode
 //
+// Modifiled - Function name should be encode3, as it is followed in 1, 2 - D cases
+
 template<class Scalar>
-size_t encode(uint3 dims, 
+size_t encode3(uint3 dims, 
               int3 stride,
               Scalar *d_data,
               Word *stream,
-              const int bits_per_block)
+              const int bits_per_block,
+              cudaStream_t custream)
 {
-  return encode3launch<Scalar>(dims, stride, d_data, stream, bits_per_block);
+  return encode3launch<Scalar>(dims, stride, d_data, stream, bits_per_block, custream);
 }
 
 }
diff --git a/src/zfp.c b/src/zfp.c
index a498a985f..d46bdb452 100644
--- a/src/zfp.c
+++ b/src/zfp.c
@@ -113,6 +113,10 @@ zfp_field_alloc()
     field->nx = field->ny = field->nz = field->nw = 0;
     field->sx = field->sy = field->sz = field->sw = 0;
     field->data = 0;
+
+#ifdef ZFP_WITH_CUDA
+    field->cuStream = 0; /* set to default stream */ 
+#endif 
   }
   return field;
 }
@@ -335,6 +339,14 @@ zfp_field_set_pointer(zfp_field* field, void* data)
   field->data = data;
 }
 
+#ifdef ZFP_WITH_CUDA
+void
+zfp_field_set_cuda_stream(zfp_field* field, cudaStream_t custream) 
+{
+  field->cuStream = custream;
+}
+#endif
+
 zfp_type
 zfp_field_set_type(zfp_field* field, zfp_type type)
 {