diff --git a/hemi/atomic.h b/hemi/atomic.h
new file mode 100644
index 0000000..09ac561
--- /dev/null
+++ b/hemi/atomic.h
@@ -0,0 +1,206 @@
+#ifndef __HEMI_ATOMIC_H__
+#define __HEMI_ATOMIC_H__
+
+#include "hemi/hemi.h"
+
+// Open-MP specifics
+#ifdef _OPENMP
+  #include <omp.h>
+#else
+  /* Typedef here is to let the compiler have something reasonable for when you
+   * are not using OpenMP, so that it compiles. The functions do fall through
+   * to the ones above, though, so it should be safe to pass 0 to that without
+   * openmp defined and all should work.
+   * We assume that the lock is initialized prior to calling any of these functions.
+   */
+typedef unsigned int omp_lock_t;
+#endif // _OPENMP
+
+
+/* Straightforward atomic implementations for device or host code.  Examples
+ * use OpenMP. If you need something for your threading environment of choice,
+ * then overload the function, put up whatever barriers are needed, and then
+ * call this.
+ *
+ * The only real important thing is that the overload also be
+ * HEMI_DEV_CALLABLE_INLINE and the basic ifdef/else/endif structure goes
+ * there.
+ *
+ */
+
+namespace hemi
+{
+  // Basic functions, assumed sequentials.
+  HEMI_DEV_CALLABLE_INLINE int atomicAdd(int* address, int val)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicAdd(address, val);
+    #else
+      float old = *address;
+      *address = old + val;
+      return old;
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE unsigned int atomicAdd(unsigned int* address, unsigned int val)
+  {   
+    #ifdef HEMI_DEV_CODE
+      return ::atomicAdd(address, val);
+    #else
+      unsigned int old = *address;
+      *address = old + val;
+      return old;
+    #endif
+  } 
+  HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicAdd(unsigned long long int* address, unsigned long long int val)
+  {   
+    #ifdef HEMI_DEV_CODE
+      return ::atomicAdd(address, val);
+    #else
+      unsigned long long int old = *address;
+      *address = old + val;
+      return old;
+    #endif
+  } 
+  
+  HEMI_DEV_CALLABLE_INLINE float atomicAdd(float* address, float val)
+  {   
+    #ifdef HEMI_DEV_CODE
+      return ::atomicAdd(address, val);
+    #else
+      float old = *address;
+      *address = old + val;
+      return old;
+    #endif
+  } 
+
+  HEMI_DEV_CALLABLE_INLINE int atomicCAS(int* address, int compare, int val) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicCAS(address, compare, val);
+    #else
+      int old = *address;
+      *address = (old == compare ? val : old);
+      return old; 
+    #endif
+  }
+  HEMI_DEV_CALLABLE_INLINE unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicCAS(address, compare, val);
+    #else
+      unsigned int old = *address;
+      *address = (old == compare ? val : old);
+      return old; 
+    #endif
+  }
+  HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicCAS(address, compare, val);
+    #else
+      unsigned long long int old = *address;
+      *address = (old == compare ? val : old);
+      return old; 
+    #endif
+  }
+
+  /* OpenMP-supported functions. These functions lock/unlock instead of using
+   * named critical sections, because the named critical sections are global in
+   * scope. If the lock acquisition fails, the function immediately returns
+   * with the current value of address.
+   */
+
+
+
+  HEMI_DEV_CALLABLE_INLINE int atomicCAS(int* address, int compare, int val, omp_lock_t* lock) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicCAS(address, compare, val);
+    #else
+      #ifdef _OPENMP
+        omp_set_lock(lock);
+        int old = hemi::atomicCAS(address, compare, val);
+        omp_unset_lock(lock);
+        return old; 
+      #else
+        return hemi::atomicCAS(address, compare, val);
+      #endif // _OPENMP
+    #endif // HEMI_DEV_CODE
+  }
+
+  HEMI_DEV_CALLABLE_INLINE unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val, omp_lock_t* lock) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicCAS(address, compare, val);
+    #else
+      #ifdef _OPENMP
+        omp_set_lock(lock);
+        unsigned int old = hemi::atomicCAS(address, compare, val);
+        omp_unset_lock(lock);
+        return old; 
+      #else
+        return hemi::atomicCAS(address, compare, val);
+      #endif // _OPENMP
+    #endif // HEMI_DEV_CODE
+  }
+
+  HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val, omp_lock_t* lock) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicCAS(address, compare, val);
+    #else
+      #ifdef _OPENMP
+        omp_set_lock(lock);
+        unsigned long long int old = hemi::atomicCAS(address, compare, val);
+        omp_unset_lock(lock);
+        return old; 
+      #else
+        return hemi::atomicCAS(address, compare, val);
+      #endif // _OPENMP
+    #endif // HEMI_DEV_CODE
+  }
+
+  HEMI_DEV_CALLABLE_INLINE int atomicExch(int* address, int val) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicExch(address,val);
+    #else
+      int old = *address;
+      *address = val;
+      return old;
+    #endif // HEMI_DEV_CODE
+  }
+  HEMI_DEV_CALLABLE_INLINE unsigned int atomicExch(unsigned int* address, unsigned int val) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicExch(address,val);
+    #else
+      unsigned int old = *address;
+      *address = val;
+      return old;
+    #endif // HEMI_DEV_CODE
+  }
+  HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicExch(address,val);
+    #else
+      unsigned long long int old = *address;
+      *address = val;
+      return old;
+    #endif // HEMI_DEV_CODE
+  }
+  HEMI_DEV_CALLABLE_INLINE float atomicExch(float* address, float val) 
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atomicExch(address,val);
+    #else
+      float old = *address;
+      *address = val;
+      return old;
+    #endif // HEMI_DEV_CODE
+  }
+}
+
+#endif
diff --git a/hemi/math.h b/hemi/math.h
new file mode 100644
index 0000000..43acc94
--- /dev/null
+++ b/hemi/math.h
@@ -0,0 +1,501 @@
+#ifndef __HEMI_MATH_H__
+#define __HEMI_MATH_H__
+
+#include "hemi/hemi.h"
+#include <cmath>
+
+
+namespace hemi 
+{
+  // Power functions
+
+  // templated POW function. For a CUDA device, it casts the arguments to double.
+  // For a host, it defaults to std::pow in <cmath>
+  template <class T> HEMI_DEV_CALLABLE_INLINE double pow(T x, T y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::pow((double)x,(double)y);
+    #else
+      return std::pow(x, y);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float pow(float x, float y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return powf(x,y);
+    #else
+      return std::pow(x, y);
+    #endif
+  }
+
+  // Templated to accept integral types (which effectively casts to double). 
+  template <class T> HEMI_DEV_CALLABLE_INLINE double sqrt (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::sqrt((double)x);
+    #else
+      return std::sqrt(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float sqrt(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return sqrtf(x);
+    #else
+      return std::sqrt(x);
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double cbrt (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::cbrt((double)x);
+    #else
+      return std::pow(x,(1.0f/3.0f));
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float cbrt(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return cbrtf(x);
+    #else
+      return std::pow(x,(1.0f/3.0f));
+    #endif
+  }
+
+  template <class T, class S> HEMI_DEV_CALLABLE_INLINE double hypot (T x, S y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::hypot((double)x, (double)y);
+    #else
+      return std::sqrt(std::pow(x,2) + std::pow(y,2));
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float hypot(float x, float y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return hypotf(x, y);
+    #else
+      return std::sqrt(std::pow(x,2)+std::pow(y,2));
+    #endif
+  }
+
+
+  // Absolute value functions.
+  template <class T> HEMI_DEV_CALLABLE_INLINE T abs (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return (T)::abs((double)x);
+    #else
+      return std::abs(x);
+    #endif
+  }
+
+  template <> HEMI_DEV_CALLABLE_INLINE int abs<int>(int x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return __sad(x,0,0);
+    #else
+      return ((x < 0) ? x * -1 : x);
+    #endif
+  }
+
+  template <> HEMI_DEV_CALLABLE_INLINE unsigned int abs<unsigned int>(unsigned int x)
+  {
+    return x;
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float abs(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return fabsf(x);
+    #else
+      return std::abs(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE double abs(double x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return copysign(x, 1.0);
+    #else
+      return std::abs(x);
+    #endif
+  }
+
+  // Trig functions
+  // templated to catch int, long, etc. The device branch casts it to double first. 
+  template <class T> HEMI_DEV_CALLABLE_INLINE double acos (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::acos((double)x);
+    #else
+      return std::acos(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float acos (float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return acosf(x);
+    #else
+      return std::acos(x);
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double asin (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::asin((double)x);
+    #else
+      return std::asin(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float asin(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return asinf(x);
+    #else
+      return std::asin(x);
+    #endif
+  }
+
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double atan (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atan((double)x);
+    #else
+      return std::atan(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float atan (float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return atanf(x);
+    #else
+      return std::atan(x);
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double atan2 (T x, T y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::atan2((double)x, (double) y);
+    #else
+      return std::atan2(x,y);
+    #endif
+  }
+  HEMI_DEV_CALLABLE_INLINE float atan2 (float x, float y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return atan2f(x,y);
+    #else
+      return std::atan2(x,y);
+    #endif
+  }
+
+  // Hyperbolic functions
+  template <class T> HEMI_DEV_CALLABLE_INLINE double acosh (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::acosh((double)x);
+    #else
+      // since hyperbolic arccosine support isn't widespread, using defintion from Wolfram Alpha
+      // which is either acosh(z) = ln(z+sqrt(z+1)*sqrt(z-1)) or acosh(z) = (sqrt(z-1))/sqrt(1-z))*acos(z)
+      return std::log(x + std::sqrt(x+1) * std::sqrt(x-1));
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float acosh(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return acoshf(x);
+    #else
+      // since hyperbolic arccosine support isn't widespread, using defintion from Wolfram Alpha
+      // which is either acosh(z) = ln(z+sqrt(z+1)*sqrt(z-1)) or acosh(z) = (sqrt(z-1))/sqrt(1-z))*acos(z)
+      return std::log(x + std::sqrt(x+1) * std::sqrt(x-1));
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double asinh (T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::asinh((double)x);
+    #else
+      return std::log(x+sqrt(1+std::pow(x,2)));
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float asinh(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return asinhf(x);
+    #else
+      return std::log(x+sqrt(1+std::pow(x,2)));
+    #endif
+  }
+
+  // Rounding/remainder functions
+  HEMI_DEV_CALLABLE_INLINE float fmod(float x, float y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return fmodf(x,y);
+    #else
+      return std::fmod(x,y);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE double fmod(double x, double y)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::fmod(x,y);
+    #else
+      return std::fmod(x,y);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float floor(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return floorf(x);
+    #else
+      return std::floor(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE double floor(double x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::floor(x);
+    #else
+      return std::floor(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float ceil(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ceilf(x);
+    #else
+      return std::ceil(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE double ceil(double x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::ceil(x);
+    #else
+      return std::ceil(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE double round(double x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::round(x);
+    #else
+      return (std::floor(x + 0.5) == std::ceil(x) ? std::ceil(x) : std::floor(x));
+    #endif
+  }
+
+  template <class T>   HEMI_DEV_CALLABLE_INLINE double round(T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::round((double)x);
+    #else
+      return (std::floor((double)x + 0.5) == std::ceil(x) ? std::ceil(x) : std::floor(x));
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float round(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return roundf(x);
+    #else
+      return (std::floor(x + 0.5f) == std::ceil(x) ? std::ceil(x) : std::floor(x));
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE long int lround(double x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::lround(x);
+    #else
+      return (std::floor(x + 0.5) == std::ceil(x) ? (long int)std::ceil(x) : (long int)std::floor(x));
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE long int lround(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return lroundf(x);
+    #else
+      return (std::floor(x + 0.5f) == std::ceil(x) ? (long int)std::ceil(x) : (long int)std::floor(x));
+    #endif
+  }
+
+  // Exponential/log functions
+  template <class T> HEMI_DEV_CALLABLE_INLINE double exp(T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::exp((double)x);
+    #else
+      return std::exp(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float exp(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return expf(x);
+    #else
+      return std::exp(x);
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double log(T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::log((double)x);
+    #else
+      return std::log(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float log(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return logf(x);
+    #else
+      return std::log(x);
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double log10(T x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::log10((double)x);
+    #else
+      return std::log10(x);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float log10(float x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return log10f(x);
+    #else
+      return std::log10(x);
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double frexp(T x, int* ntpr)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::frexp((double)x, ntpr);
+    #else
+      return std::frexp(x, ntpr);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float frexp(float x, int* ntpr)
+  {
+    #ifdef HEMI_DEV_CODE
+      return frexpf(x, ntpr);
+    #else
+      return std::frexp(x, ntpr);
+    #endif
+  }
+
+  template <class T> HEMI_DEV_CALLABLE_INLINE double ldexp(T x, int ntpr)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ::ldexp((double)x, ntpr);
+    #else
+      return std::ldexp(x, ntpr);
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE float ldexp(float x, int ntpr)
+  {
+    #ifdef HEMI_DEV_CODE
+      return ldexpf(x, ntpr);
+    #else
+      return std::ldexp(x,ntpr);
+    #endif
+  }
+
+  // integer functions
+  HEMI_DEV_CALLABLE_INLINE unsigned int brev(unsigned int x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return __brev(x);
+    #else
+      unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 
+      unsigned int mask = ~0;         
+      while ((s >>= 1) > 0) 
+      {
+        mask ^= (mask << s);
+        x = ((x >> s) & mask) | ((x << s) & ~mask);
+      }
+      return x;
+    #endif 
+  }
+
+  HEMI_DEV_CALLABLE_INLINE unsigned long long int brev(unsigned long long int x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return __brevll(x);
+    #else
+        unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 
+        unsigned int mask = ~0;         
+        while ((s >>= 1) > 0) 
+        {
+          mask ^= (mask << s);
+          x = ((x >> s) & mask) | ((x << s) & ~mask);
+        }
+        return x;
+    #endif 
+  }
+
+  HEMI_DEV_CALLABLE_INLINE unsigned int popc(unsigned int x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return __popc(x);
+    #else
+      unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 
+      unsigned int count = 0;
+      for (s = sizeof(x) * CHAR_BIT; s > 0; s--)
+      {
+        count += (x & 1) > 0;
+        x >>= 1;
+      }
+      return count;
+    #endif
+  }
+
+  HEMI_DEV_CALLABLE_INLINE unsigned long long int popc(unsigned long long int x)
+  {
+    #ifdef HEMI_DEV_CODE
+      return __popcll(x);
+    #else
+      unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 
+      unsigned int count = 0;
+      for (s = sizeof(x) * CHAR_BIT; s > 0; s--)
+      {
+        count += (x & 1) > 0;
+        x >>= 1;
+      }
+      return count;
+    #endif
+  }
+}
+#endif // HEMI_MATH_H
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..50def17
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,31 @@
+# Change this to point to wherever CUDA is installed on your local system.
+CUDA_PATH := /opt/net/apps/cuda
+NVCC=${CUDA_PATH}/bin/nvcc
+CXX=g++
+
+CC_FLAGS := -I../ -I${CUDA_PATH}/include
+
+NVCC_FLAGS := ${CC_FLAGS} -arch=sm_21 
+ARCH := $(shell getconf LONG_BIT)
+
+LIB_FLAGS_32 := -L$(CUDA_PATH)/lib
+LIB_FLAGS_64 := -L$(CUDA_PATH)/lib64
+
+LIB_FLAGS := $(LIB_FLAGS_$(ARCH)) -lcudart
+
+.PHONY: math clean
+all: math
+
+math: math_host_g++ math_host_nvcc math_device 
+	for prog in $^; do \
+		./$$prog; \
+	done
+
+math_device: math.cpp
+	${NVCC} ${LIB_FLAGS} -x cu ${CC_FLAGS} ${NVCC_FLAGS} $< -o $@
+
+math_host_g++: math.cpp
+	${CXX} ${LIB_FLAGS} ${CC_FLAGS} $< -o $@
+
+math_host_nvcc: math.cpp
+	${NVCC} ${LIB_FLAGS} ${CC_FLAGS} ${NVCC_FLAGS} $< -o $@
diff --git a/test/math.cpp b/test/math.cpp
new file mode 100644
index 0000000..007309d
--- /dev/null
+++ b/test/math.cpp
@@ -0,0 +1,64 @@
+#include <hemi/hemi.h>
+#include <hemi/array.h>
+#include <hemi/math.h>
+#include <cuda.h>
+
+
+HEMI_DEFINE_CONSTANT(unsigned int N,6);
+// Test harness for the math functions in hemi/math.h 
+// Uses hemi::Array
+
+HEMI_KERNEL(PowF)(const float *in, float *out)
+{
+  out[0] = hemi::pow(in[0],in[1]);
+  out[1] = hemi::pow(in[1],in[2]);
+  out[2] = hemi::pow(in[2],in[3]);
+  assert(out[0] == 0.0f);
+  assert(out[1] == 1.0f);
+  assert(out[2] == 8.0f);
+}
+HEMI_KERNEL(PowD)(const double *in, double *out)
+{
+  out[0] = hemi::pow(in[0],in[1]);
+  out[1] = hemi::pow(in[1],in[2]);
+  out[2] = hemi::pow(in[2],in[3]);
+  printf("%g^(%g) = %e\n", in[0], in[1], hemi::round(out[0]));
+  out[0] = hemi::round(out[0]);
+  assert(hemi::round(out[0]) == 0);
+  assert(out[1] == 1.0);
+  assert(out[2] == 8.0);
+
+}
+
+HEMI_KERNEL(Round)()
+{
+  printf("Testing rounding functions.\n");
+  assert(hemi::round(0.5f) == 1.0f);
+  assert(hemi::round(0.4f) == 0.0f);
+  assert(hemi::round(0.4) == 0.0);
+}
+
+int main()
+{
+  hemi::Array<double> dbl_test_in(HEMI_CONSTANT(N), false);
+  hemi::Array<double> dbl_test_out(HEMI_CONSTANT(N), false);
+  hemi::Array<float> flt_test_in(HEMI_CONSTANT(N), false);
+  hemi::Array<float> flt_test_out(HEMI_CONSTANT(N), false);
+
+  printf("%s: Initializing memory for %d-length arrays\n", HEMI_LOC_STRING, HEMI_CONSTANT(N));
+  for (int i = 0; i < HEMI_CONSTANT(N); i++)
+  {
+    dbl_test_in.writeOnlyHostPtr()[i] = i * 1.0;
+    dbl_test_out.writeOnlyHostPtr()[i] = 0;
+    flt_test_in.writeOnlyHostPtr()[i] = i * 1.0f;
+    flt_test_out.writeOnlyHostPtr()[i] = 0;
+  }
+
+  int gridDim = 1, blockDim = 1;
+  printf("Running %s Version...\n", HEMI_LOC_STRING);
+
+  HEMI_KERNEL_LAUNCH(Round, gridDim, blockDim, 0,0);
+  HEMI_KERNEL_LAUNCH(PowF, gridDim, blockDim, 0,0, flt_test_in.readOnlyPtr(), flt_test_out.writeOnlyPtr());
+  HEMI_KERNEL_LAUNCH(PowD, gridDim, blockDim, 0,0, dbl_test_in.readOnlyPtr(), dbl_test_out.writeOnlyPtr());
+
+}