diff --git a/src/CudaHelp.hh b/src/CudaHelp.hh
index de6b389..9326593 100644
--- a/src/CudaHelp.hh
+++ b/src/CudaHelp.hh
@@ -6,11 +6,14 @@
 #define MIN_CTAS_PER_SM     4
 #define MAX_REDUCTION_CTAS  1024
 
-#ifdef __CUDACC__
+#ifdef USE_CUDA
 #include <cuda_runtime.h>
 #include "legion.h"
+#ifndef __CUDA_HD__
 #define __CUDA_HD__ __host__ __device__
+#endif
 
+#ifdef __CUDACC__
 template<typename REDUCTION>
 __device__ __forceinline__
 void reduce_double(Legion::DeferredReduction<REDUCTION> result, double value)
@@ -39,6 +42,7 @@ void reduce_double(Legion::DeferredReduction<REDUCTION> result, double value)
     __threadfence_system();
   }
 }
+#endif
 
 #else
 #define __CUDA_HD__
diff --git a/src/Hydro.cc b/src/Hydro.cc
index 79b98e9..554a1c0 100644
--- a/src/Hydro.cc
+++ b/src/Hydro.cc
@@ -217,8 +217,8 @@ Hydro::Hydro(
     tts = new TTS(inp, this);
     qcs = new QCS(inp, this);
 
-    const double2 vfixx = double2(1., 0.);
-    const double2 vfixy = double2(0., 1.);
+    const double2 vfixx = make_double2(1., 0.);
+    const double2 vfixy = make_double2(0., 1.);
     for (int i = 0; i < bcx.size(); ++i)
       bcs.push_back(new HydroBC(mesh, vfixx, bcx[i], true/*xplane*/));
     for (int i = 0; i < bcy.size(); ++i)
@@ -331,7 +331,7 @@ void Hydro::init() {
   }
   else
   {
-    const double2 zero2(0., 0.);
+    const double2 zero2 = make_double2(0., 0.);
     FillLauncher launcher(lrp, lrp, TaskArgument(&zero2,sizeof(zero2)));
     launcher.add_field(FID_PU);
     runtime->fill_fields(ctx, launcher);
@@ -375,7 +375,7 @@ Future Hydro::doCycle(
     launchffd.argument = TaskArgument(ffdargs, sizeof(ffdargs));
     launchffd.predicate = p_not_done;
 
-    double2 ffd2args[] = { double2(0., 0.) };
+    double2 ffd2args[] = { make_double2(0., 0.) };
     IndexFillLauncher launchffd2;
     launchffd2.launch_space = ispc;
     launchffd2.projection = 0;
@@ -1802,7 +1802,7 @@ void Hydro::initRadialVelTask(
     if (pmag > args->eps)
       acc_pu[*itr] = args->vel * px / pmag;
     else
-      acc_pu[*itr] = double2(0., 0.);
+      acc_pu[*itr] = make_double2(0., 0.);
   }
 }
 
diff --git a/src/Mesh.cc b/src/Mesh.cc
index 678aade..636c7d6 100644
--- a/src/Mesh.cc
+++ b/src/Mesh.cc
@@ -163,7 +163,7 @@ const int SumOp<int>::identity = 0;
 template <>
 const double SumOp<double>::identity = 0.;
 template <>
-const double2 SumOp<double2>::identity = double2(0., 0.);
+const double2 SumOp<double2>::identity = make_double2(0., 0.);
 template <>
 const double MinOp<double>::identity = DBL_MAX;
 template <>
@@ -817,7 +817,7 @@ void Mesh::calcCtrsTask(
 
     const IndexSpace& isz = task->regions[1].region.get_index_space();
     for (PointIterator itr(runtime, isz); itr(); itr++)
-      acc_zx[*itr] = double2(0., 0.);
+      acc_zx[*itr] = make_double2(0., 0.);
 
     const IndexSpace& iss = task->regions[0].region.get_index_space();
     for (PointIterator itr(runtime, iss); itr(); itr++)
@@ -863,7 +863,7 @@ void Mesh::calcCtrsOMPTask(
     const Rect<1> rectz = runtime->get_index_space_domain(isz);
     #pragma omp parallel for
     for (coord_t z = rectz.lo[0]; z <= rectz.hi[0]; z++)
-      acc_zx[z] = double2(0., 0.);
+      acc_zx[z] = make_double2(0., 0.);
 
     const IndexSpace& iss = task->regions[0].region.get_index_space();
     // This will assert if it is not dense
@@ -1288,7 +1288,7 @@ void Mesh::calcCtrs(
 
     int zfirst = mapsz[sfirst];
     int zlast = (slast < nums ? mapsz[slast] : numz);
-    fill(&zx[zfirst], &zx[zlast], double2(0., 0.));
+    fill(&zx[zfirst], &zx[zlast], make_double2(0., 0.));
 
     for (int s = sfirst; s < slast; ++s) {
         int p1 = mapsp1[s];
diff --git a/src/QCS.cc b/src/QCS.cc
index 18df95f..3f65c16 100644
--- a/src/QCS.cc
+++ b/src/QCS.cc
@@ -146,7 +146,7 @@ void QCS::setCornerDivTask(
     // [1] Compute a zone-centered velocity
     const IndexSpace& isz = task->regions[1].region.get_index_space();
     for (PointIterator itz(runtime, isz); itz(); itz++)
-        acc_zuc[*itz] = double2(0., 0.);
+        acc_zuc[*itz] = make_double2(0., 0.);
 
     const IndexSpace& iss = task->regions[0].region.get_index_space();
     for (PointIterator its(runtime, iss); its(); its++)
@@ -488,7 +488,7 @@ void QCS::setCornerDivOMPTask(
     const Rect<1> rectz = runtime->get_index_space_domain(isz);
     #pragma omp parallel for
     for (coord_t z = rectz.lo[0]; z <= rectz.hi[0]; z++)
-        acc_zuc[z] = double2(0., 0.);
+        acc_zuc[z] = make_double2(0., 0.);
 
     const IndexSpace& iss = task->regions[0].region.get_index_space();
     // This will assert if it is not dense
diff --git a/src/Vec2.hh b/src/Vec2.hh
index 523fd7e..7f96ca4 100644
--- a/src/Vec2.hh
+++ b/src/Vec2.hh
@@ -20,7 +20,7 @@
 // This struct is defined with all functions inline,
 // to give the compiler maximum opportunity to optimize.
 
-#ifndef __CUDACC__
+#ifndef USE_CUDA
 struct double2
 {
     typedef double value_type;
@@ -37,38 +37,10 @@ struct double2
         return(*this);
     }
 
-    inline double2& operator+=(const double2& v2)
-    {
-        x += v2.x;
-        y += v2.y;
-        return(*this);
-    }
-
-    inline double2& operator-=(const double2& v2)
-    {
-        x -= v2.x;
-        y -= v2.y;
-        return(*this);
-    }
-
-    inline double2& operator*=(const double& r)
-    {
-        x *= r;
-        y *= r;
-        return(*this);
-    }
-
-    inline double2& operator/=(const double& r)
-    {
-        x /= r;
-        y /= r;
-        return(*this);
-    }
-
 }; // double2
-#endif // __CUDACC__
+#endif // USE_CUDA
 
-#ifndef __CUDACC__
+#ifndef USE_CUDA
 // Already has a decleration in cuda
 inline double2 make_double2(double x_, double y_) {
     return(double2(x_, y_));
@@ -119,6 +91,14 @@ inline double2 operator+(const double2& v1, const double2& v2)
     return make_double2(v1.x + v2.x, v1.y + v2.y);
 }
 
+__CUDA_HD__
+inline double2& operator+=(double2& v1, const double2& v2)
+{
+    v1.x += v2.x;
+    v1.y += v2.y;
+    return v1;
+}
+
 // subtract
 __CUDA_HD__
 inline double2 operator-(const double2& v1, const double2& v2)
@@ -126,6 +106,14 @@ inline double2 operator-(const double2& v1, const double2& v2)
     return make_double2(v1.x - v2.x, v1.y - v2.y);
 }
 
+__CUDA_HD__
+inline double2& operator-=(double2& v1, const double2& v2)
+{
+    v1.x -= v2.x;
+    v1.y -= v2.y;
+    return v1;
+}
+
 // multiply vector by scalar
 __CUDA_HD__
 inline double2 operator*(const double2& v, const double& r)
@@ -133,6 +121,14 @@ inline double2 operator*(const double2& v, const double& r)
     return make_double2(v.x * r, v.y * r);
 }
 
+__CUDA_HD__
+inline double2& operator*=(double2& v, const double& r)
+{
+    v.x *= r;
+    v.y *= r;
+    return v;
+}
+
 // multiply scalar by vector
 __CUDA_HD__
 inline double2 operator*(const double& r, const double2& v)
@@ -148,6 +144,14 @@ inline double2 operator/(const double2& v, const double& r)
     return make_double2(v.x * rinv, v.y * rinv);
 }
 
+__CUDA_HD__
+inline double2& operator/=(double2& v, const double& r)
+{
+    double rinv = (double) 1. / r;
+    v.x *= rinv;
+    v.y *= rinv;
+    return v;
+}
 
 // other vector operations: