From ce4f49e08b37e71fe56b5d91d3f439e7afdeb6d9 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 19:08:25 +0000 Subject: [PATCH] Add dot kernel to HIP implementation --- HIPStream.cu | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ HIPStream.h | 6 ++++++ 2 files changed, 55 insertions(+) diff --git a/HIPStream.cu b/HIPStream.cu index 8c02348a..d14fe844 100644 --- a/HIPStream.cu +++ b/HIPStream.cu @@ -9,6 +9,7 @@ #include "hip/hip_runtime.h" #define TBSIZE 1024 +#define DOT_NUM_BLOCKS 256 void check_error(void) { @@ -47,6 +48,9 @@ HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; + // Allocate the host array for partial sums for dot kernels + sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); @@ -60,6 +64,8 @@ HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); + hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + check_error(); } @@ -172,6 +178,49 @@ void HIPStream::triad() } +template +__global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size) +{ + + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T *tb_sum = reinterpret_cast(smem); + + int i = blockDim.x * blockIdx.x + threadIdx.x; + const size_t local_i = threadIdx.x; + + tb_sum[local_i] = 0.0; + for (; i < array_size; i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) + { + __syncthreads(); + if (local_i < offset) + { + tb_sum[local_i] += tb_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; +} + +template +T HIPStream::dot() +{ + hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), sizeof(T)*TBSIZE, 0, d_a, d_b, d_sum, array_size); + check_error(); + + hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); + check_error(); + + T sum = 0.0; + for (int i = 0; i < DOT_NUM_BLOCKS; i++) + sum += sums[i]; + + return sum; +} + void listDevices(void) { // Get number of devices diff --git a/HIPStream.h b/HIPStream.h index 392080ad..9209388a 100644 --- a/HIPStream.h +++ b/HIPStream.h @@ -21,10 +21,15 @@ class HIPStream : public Stream protected: // Size of arrays unsigned int array_size; + + // Host array for partial sums for dot kernel + T *sums; + // Device side pointers to arrays T *d_a; T *d_b; T *d_c; + T *d_sum; public: @@ -36,6 +41,7 @@ class HIPStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override;