diff --git a/hemi/atomic.h b/hemi/atomic.h new file mode 100644 index 0000000..09ac561 --- /dev/null +++ b/hemi/atomic.h @@ -0,0 +1,206 @@ +#ifndef __HEMI_ATOMIC_H__ +#define __HEMI_ATOMIC_H__ + +#include "hemi/hemi.h" + +// Open-MP specifics +#ifdef _OPENMP + #include +#else + /* Typedef here is to let the compiler have something reasonable for when you + * are not using OpenMP, so that it compiles. The functions do fall through + * to the ones above, though, so it should be safe to pass 0 to that without + * openmp defined and all should work. + * We assume that the lock is initialized prior to calling any of these functions. + */ +typedef unsigned int omp_lock_t; +#endif // _OPENMP + + +/* Straightforward atomic implementations for device or host code. Examples + * use OpenMP. If you need something for your threading environment of choice, + * then overload the function, put up whatever barriers are needed, and then + * call this. + * + * The only real important thing is that the overload also be + * HEMI_DEV_CALLABLE_INLINE and the basic ifdef/else/endif structure goes + * there. + * + */ + +namespace hemi +{ + // Basic functions, assumed sequentials. + HEMI_DEV_CALLABLE_INLINE int atomicAdd(int* address, int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicAdd(address, val); + #else + float old = *address; + *address = old + val; + return old; + #endif + } + + HEMI_DEV_CALLABLE_INLINE unsigned int atomicAdd(unsigned int* address, unsigned int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicAdd(address, val); + #else + unsigned int old = *address; + *address = old + val; + return old; + #endif + } + HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicAdd(unsigned long long int* address, unsigned long long int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicAdd(address, val); + #else + unsigned long long int old = *address; + *address = old + val; + return old; + #endif + } + + HEMI_DEV_CALLABLE_INLINE float atomicAdd(float* address, float val) + { + #ifdef HEMI_DEV_CODE + return ::atomicAdd(address, val); + #else + float old = *address; + *address = old + val; + return old; + #endif + } + + HEMI_DEV_CALLABLE_INLINE int atomicCAS(int* address, int compare, int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicCAS(address, compare, val); + #else + int old = *address; + *address = (old == compare ? val : old); + return old; + #endif + } + HEMI_DEV_CALLABLE_INLINE unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicCAS(address, compare, val); + #else + unsigned int old = *address; + *address = (old == compare ? val : old); + return old; + #endif + } + HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicCAS(address, compare, val); + #else + unsigned long long int old = *address; + *address = (old == compare ? val : old); + return old; + #endif + } + + /* OpenMP-supported functions. These functions lock/unlock instead of using + * named critical sections, because the named critical sections are global in + * scope. If the lock acquisition fails, the function immediately returns + * with the current value of address. + */ + + + + HEMI_DEV_CALLABLE_INLINE int atomicCAS(int* address, int compare, int val, omp_lock_t* lock) + { + #ifdef HEMI_DEV_CODE + return ::atomicCAS(address, compare, val); + #else + #ifdef _OPENMP + omp_set_lock(lock); + int old = hemi::atomicCAS(address, compare, val); + omp_unset_lock(lock); + return old; + #else + return hemi::atomicCAS(address, compare, val); + #endif // _OPENMP + #endif // HEMI_DEV_CODE + } + + HEMI_DEV_CALLABLE_INLINE unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val, omp_lock_t* lock) + { + #ifdef HEMI_DEV_CODE + return ::atomicCAS(address, compare, val); + #else + #ifdef _OPENMP + omp_set_lock(lock); + unsigned int old = hemi::atomicCAS(address, compare, val); + omp_unset_lock(lock); + return old; + #else + return hemi::atomicCAS(address, compare, val); + #endif // _OPENMP + #endif // HEMI_DEV_CODE + } + + HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val, omp_lock_t* lock) + { + #ifdef HEMI_DEV_CODE + return ::atomicCAS(address, compare, val); + #else + #ifdef _OPENMP + omp_set_lock(lock); + unsigned long long int old = hemi::atomicCAS(address, compare, val); + omp_unset_lock(lock); + return old; + #else + return hemi::atomicCAS(address, compare, val); + #endif // _OPENMP + #endif // HEMI_DEV_CODE + } + + HEMI_DEV_CALLABLE_INLINE int atomicExch(int* address, int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicExch(address,val); + #else + int old = *address; + *address = val; + return old; + #endif // HEMI_DEV_CODE + } + HEMI_DEV_CALLABLE_INLINE unsigned int atomicExch(unsigned int* address, unsigned int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicExch(address,val); + #else + unsigned int old = *address; + *address = val; + return old; + #endif // HEMI_DEV_CODE + } + HEMI_DEV_CALLABLE_INLINE unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val) + { + #ifdef HEMI_DEV_CODE + return ::atomicExch(address,val); + #else + unsigned long long int old = *address; + *address = val; + return old; + #endif // HEMI_DEV_CODE + } + HEMI_DEV_CALLABLE_INLINE float atomicExch(float* address, float val) + { + #ifdef HEMI_DEV_CODE + return ::atomicExch(address,val); + #else + float old = *address; + *address = val; + return old; + #endif // HEMI_DEV_CODE + } +} + +#endif diff --git a/hemi/math.h b/hemi/math.h new file mode 100644 index 0000000..43acc94 --- /dev/null +++ b/hemi/math.h @@ -0,0 +1,501 @@ +#ifndef __HEMI_MATH_H__ +#define __HEMI_MATH_H__ + +#include "hemi/hemi.h" +#include + + +namespace hemi +{ + // Power functions + + // templated POW function. For a CUDA device, it casts the arguments to double. + // For a host, it defaults to std::pow in + template HEMI_DEV_CALLABLE_INLINE double pow(T x, T y) + { + #ifdef HEMI_DEV_CODE + return ::pow((double)x,(double)y); + #else + return std::pow(x, y); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float pow(float x, float y) + { + #ifdef HEMI_DEV_CODE + return powf(x,y); + #else + return std::pow(x, y); + #endif + } + + // Templated to accept integral types (which effectively casts to double). + template HEMI_DEV_CALLABLE_INLINE double sqrt (T x) + { + #ifdef HEMI_DEV_CODE + return ::sqrt((double)x); + #else + return std::sqrt(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float sqrt(float x) + { + #ifdef HEMI_DEV_CODE + return sqrtf(x); + #else + return std::sqrt(x); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double cbrt (T x) + { + #ifdef HEMI_DEV_CODE + return ::cbrt((double)x); + #else + return std::pow(x,(1.0f/3.0f)); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float cbrt(float x) + { + #ifdef HEMI_DEV_CODE + return cbrtf(x); + #else + return std::pow(x,(1.0f/3.0f)); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double hypot (T x, S y) + { + #ifdef HEMI_DEV_CODE + return ::hypot((double)x, (double)y); + #else + return std::sqrt(std::pow(x,2) + std::pow(y,2)); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float hypot(float x, float y) + { + #ifdef HEMI_DEV_CODE + return hypotf(x, y); + #else + return std::sqrt(std::pow(x,2)+std::pow(y,2)); + #endif + } + + + // Absolute value functions. + template HEMI_DEV_CALLABLE_INLINE T abs (T x) + { + #ifdef HEMI_DEV_CODE + return (T)::abs((double)x); + #else + return std::abs(x); + #endif + } + + template <> HEMI_DEV_CALLABLE_INLINE int abs(int x) + { + #ifdef HEMI_DEV_CODE + return __sad(x,0,0); + #else + return ((x < 0) ? x * -1 : x); + #endif + } + + template <> HEMI_DEV_CALLABLE_INLINE unsigned int abs(unsigned int x) + { + return x; + } + + HEMI_DEV_CALLABLE_INLINE float abs(float x) + { + #ifdef HEMI_DEV_CODE + return fabsf(x); + #else + return std::abs(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE double abs(double x) + { + #ifdef HEMI_DEV_CODE + return copysign(x, 1.0); + #else + return std::abs(x); + #endif + } + + // Trig functions + // templated to catch int, long, etc. The device branch casts it to double first. + template HEMI_DEV_CALLABLE_INLINE double acos (T x) + { + #ifdef HEMI_DEV_CODE + return ::acos((double)x); + #else + return std::acos(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float acos (float x) + { + #ifdef HEMI_DEV_CODE + return acosf(x); + #else + return std::acos(x); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double asin (T x) + { + #ifdef HEMI_DEV_CODE + return ::asin((double)x); + #else + return std::asin(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float asin(float x) + { + #ifdef HEMI_DEV_CODE + return asinf(x); + #else + return std::asin(x); + #endif + } + + + template HEMI_DEV_CALLABLE_INLINE double atan (T x) + { + #ifdef HEMI_DEV_CODE + return ::atan((double)x); + #else + return std::atan(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float atan (float x) + { + #ifdef HEMI_DEV_CODE + return atanf(x); + #else + return std::atan(x); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double atan2 (T x, T y) + { + #ifdef HEMI_DEV_CODE + return ::atan2((double)x, (double) y); + #else + return std::atan2(x,y); + #endif + } + HEMI_DEV_CALLABLE_INLINE float atan2 (float x, float y) + { + #ifdef HEMI_DEV_CODE + return atan2f(x,y); + #else + return std::atan2(x,y); + #endif + } + + // Hyperbolic functions + template HEMI_DEV_CALLABLE_INLINE double acosh (T x) + { + #ifdef HEMI_DEV_CODE + return ::acosh((double)x); + #else + // since hyperbolic arccosine support isn't widespread, using defintion from Wolfram Alpha + // which is either acosh(z) = ln(z+sqrt(z+1)*sqrt(z-1)) or acosh(z) = (sqrt(z-1))/sqrt(1-z))*acos(z) + return std::log(x + std::sqrt(x+1) * std::sqrt(x-1)); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float acosh(float x) + { + #ifdef HEMI_DEV_CODE + return acoshf(x); + #else + // since hyperbolic arccosine support isn't widespread, using defintion from Wolfram Alpha + // which is either acosh(z) = ln(z+sqrt(z+1)*sqrt(z-1)) or acosh(z) = (sqrt(z-1))/sqrt(1-z))*acos(z) + return std::log(x + std::sqrt(x+1) * std::sqrt(x-1)); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double asinh (T x) + { + #ifdef HEMI_DEV_CODE + return ::asinh((double)x); + #else + return std::log(x+sqrt(1+std::pow(x,2))); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float asinh(float x) + { + #ifdef HEMI_DEV_CODE + return asinhf(x); + #else + return std::log(x+sqrt(1+std::pow(x,2))); + #endif + } + + // Rounding/remainder functions + HEMI_DEV_CALLABLE_INLINE float fmod(float x, float y) + { + #ifdef HEMI_DEV_CODE + return fmodf(x,y); + #else + return std::fmod(x,y); + #endif + } + + HEMI_DEV_CALLABLE_INLINE double fmod(double x, double y) + { + #ifdef HEMI_DEV_CODE + return ::fmod(x,y); + #else + return std::fmod(x,y); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float floor(float x) + { + #ifdef HEMI_DEV_CODE + return floorf(x); + #else + return std::floor(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE double floor(double x) + { + #ifdef HEMI_DEV_CODE + return ::floor(x); + #else + return std::floor(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float ceil(float x) + { + #ifdef HEMI_DEV_CODE + return ceilf(x); + #else + return std::ceil(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE double ceil(double x) + { + #ifdef HEMI_DEV_CODE + return ::ceil(x); + #else + return std::ceil(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE double round(double x) + { + #ifdef HEMI_DEV_CODE + return ::round(x); + #else + return (std::floor(x + 0.5) == std::ceil(x) ? std::ceil(x) : std::floor(x)); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double round(T x) + { + #ifdef HEMI_DEV_CODE + return ::round((double)x); + #else + return (std::floor((double)x + 0.5) == std::ceil(x) ? std::ceil(x) : std::floor(x)); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float round(float x) + { + #ifdef HEMI_DEV_CODE + return roundf(x); + #else + return (std::floor(x + 0.5f) == std::ceil(x) ? std::ceil(x) : std::floor(x)); + #endif + } + + HEMI_DEV_CALLABLE_INLINE long int lround(double x) + { + #ifdef HEMI_DEV_CODE + return ::lround(x); + #else + return (std::floor(x + 0.5) == std::ceil(x) ? (long int)std::ceil(x) : (long int)std::floor(x)); + #endif + } + + HEMI_DEV_CALLABLE_INLINE long int lround(float x) + { + #ifdef HEMI_DEV_CODE + return lroundf(x); + #else + return (std::floor(x + 0.5f) == std::ceil(x) ? (long int)std::ceil(x) : (long int)std::floor(x)); + #endif + } + + // Exponential/log functions + template HEMI_DEV_CALLABLE_INLINE double exp(T x) + { + #ifdef HEMI_DEV_CODE + return ::exp((double)x); + #else + return std::exp(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float exp(float x) + { + #ifdef HEMI_DEV_CODE + return expf(x); + #else + return std::exp(x); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double log(T x) + { + #ifdef HEMI_DEV_CODE + return ::log((double)x); + #else + return std::log(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float log(float x) + { + #ifdef HEMI_DEV_CODE + return logf(x); + #else + return std::log(x); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double log10(T x) + { + #ifdef HEMI_DEV_CODE + return ::log10((double)x); + #else + return std::log10(x); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float log10(float x) + { + #ifdef HEMI_DEV_CODE + return log10f(x); + #else + return std::log10(x); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double frexp(T x, int* ntpr) + { + #ifdef HEMI_DEV_CODE + return ::frexp((double)x, ntpr); + #else + return std::frexp(x, ntpr); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float frexp(float x, int* ntpr) + { + #ifdef HEMI_DEV_CODE + return frexpf(x, ntpr); + #else + return std::frexp(x, ntpr); + #endif + } + + template HEMI_DEV_CALLABLE_INLINE double ldexp(T x, int ntpr) + { + #ifdef HEMI_DEV_CODE + return ::ldexp((double)x, ntpr); + #else + return std::ldexp(x, ntpr); + #endif + } + + HEMI_DEV_CALLABLE_INLINE float ldexp(float x, int ntpr) + { + #ifdef HEMI_DEV_CODE + return ldexpf(x, ntpr); + #else + return std::ldexp(x,ntpr); + #endif + } + + // integer functions + HEMI_DEV_CALLABLE_INLINE unsigned int brev(unsigned int x) + { + #ifdef HEMI_DEV_CODE + return __brev(x); + #else + unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 + unsigned int mask = ~0; + while ((s >>= 1) > 0) + { + mask ^= (mask << s); + x = ((x >> s) & mask) | ((x << s) & ~mask); + } + return x; + #endif + } + + HEMI_DEV_CALLABLE_INLINE unsigned long long int brev(unsigned long long int x) + { + #ifdef HEMI_DEV_CODE + return __brevll(x); + #else + unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 + unsigned int mask = ~0; + while ((s >>= 1) > 0) + { + mask ^= (mask << s); + x = ((x >> s) & mask) | ((x << s) & ~mask); + } + return x; + #endif + } + + HEMI_DEV_CALLABLE_INLINE unsigned int popc(unsigned int x) + { + #ifdef HEMI_DEV_CODE + return __popc(x); + #else + unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 + unsigned int count = 0; + for (s = sizeof(x) * CHAR_BIT; s > 0; s--) + { + count += (x & 1) > 0; + x >>= 1; + } + return count; + #endif + } + + HEMI_DEV_CALLABLE_INLINE unsigned long long int popc(unsigned long long int x) + { + #ifdef HEMI_DEV_CODE + return __popcll(x); + #else + unsigned int s = sizeof(x) * CHAR_BIT; // bit size; must be power of 2 + unsigned int count = 0; + for (s = sizeof(x) * CHAR_BIT; s > 0; s--) + { + count += (x & 1) > 0; + x >>= 1; + } + return count; + #endif + } +} +#endif // HEMI_MATH_H diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..50def17 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,31 @@ +# Change this to point to wherever CUDA is installed on your local system. +CUDA_PATH := /opt/net/apps/cuda +NVCC=${CUDA_PATH}/bin/nvcc +CXX=g++ + +CC_FLAGS := -I../ -I${CUDA_PATH}/include + +NVCC_FLAGS := ${CC_FLAGS} -arch=sm_21 +ARCH := $(shell getconf LONG_BIT) + +LIB_FLAGS_32 := -L$(CUDA_PATH)/lib +LIB_FLAGS_64 := -L$(CUDA_PATH)/lib64 + +LIB_FLAGS := $(LIB_FLAGS_$(ARCH)) -lcudart + +.PHONY: math clean +all: math + +math: math_host_g++ math_host_nvcc math_device + for prog in $^; do \ + ./$$prog; \ + done + +math_device: math.cpp + ${NVCC} ${LIB_FLAGS} -x cu ${CC_FLAGS} ${NVCC_FLAGS} $< -o $@ + +math_host_g++: math.cpp + ${CXX} ${LIB_FLAGS} ${CC_FLAGS} $< -o $@ + +math_host_nvcc: math.cpp + ${NVCC} ${LIB_FLAGS} ${CC_FLAGS} ${NVCC_FLAGS} $< -o $@ diff --git a/test/math.cpp b/test/math.cpp new file mode 100644 index 0000000..007309d --- /dev/null +++ b/test/math.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include + + +HEMI_DEFINE_CONSTANT(unsigned int N,6); +// Test harness for the math functions in hemi/math.h +// Uses hemi::Array + +HEMI_KERNEL(PowF)(const float *in, float *out) +{ + out[0] = hemi::pow(in[0],in[1]); + out[1] = hemi::pow(in[1],in[2]); + out[2] = hemi::pow(in[2],in[3]); + assert(out[0] == 0.0f); + assert(out[1] == 1.0f); + assert(out[2] == 8.0f); +} +HEMI_KERNEL(PowD)(const double *in, double *out) +{ + out[0] = hemi::pow(in[0],in[1]); + out[1] = hemi::pow(in[1],in[2]); + out[2] = hemi::pow(in[2],in[3]); + printf("%g^(%g) = %e\n", in[0], in[1], hemi::round(out[0])); + out[0] = hemi::round(out[0]); + assert(hemi::round(out[0]) == 0); + assert(out[1] == 1.0); + assert(out[2] == 8.0); + +} + +HEMI_KERNEL(Round)() +{ + printf("Testing rounding functions.\n"); + assert(hemi::round(0.5f) == 1.0f); + assert(hemi::round(0.4f) == 0.0f); + assert(hemi::round(0.4) == 0.0); +} + +int main() +{ + hemi::Array dbl_test_in(HEMI_CONSTANT(N), false); + hemi::Array dbl_test_out(HEMI_CONSTANT(N), false); + hemi::Array flt_test_in(HEMI_CONSTANT(N), false); + hemi::Array flt_test_out(HEMI_CONSTANT(N), false); + + printf("%s: Initializing memory for %d-length arrays\n", HEMI_LOC_STRING, HEMI_CONSTANT(N)); + for (int i = 0; i < HEMI_CONSTANT(N); i++) + { + dbl_test_in.writeOnlyHostPtr()[i] = i * 1.0; + dbl_test_out.writeOnlyHostPtr()[i] = 0; + flt_test_in.writeOnlyHostPtr()[i] = i * 1.0f; + flt_test_out.writeOnlyHostPtr()[i] = 0; + } + + int gridDim = 1, blockDim = 1; + printf("Running %s Version...\n", HEMI_LOC_STRING); + + HEMI_KERNEL_LAUNCH(Round, gridDim, blockDim, 0,0); + HEMI_KERNEL_LAUNCH(PowF, gridDim, blockDim, 0,0, flt_test_in.readOnlyPtr(), flt_test_out.writeOnlyPtr()); + HEMI_KERNEL_LAUNCH(PowD, gridDim, blockDim, 0,0, dbl_test_in.readOnlyPtr(), dbl_test_out.writeOnlyPtr()); + +}