Skip to content

Commit

Permalink
[gpu/enhance] Utility for registering Blas kernels during initialization
Browse files Browse the repository at this point in the history
Default Blas kernel registration during cl_context initialization
Remove RunLayerContext dependency from unit tests

Signed-off-by: Debadri Samaddar <[email protected]>
  • Loading branch information
s-debadri authored and jijoongmoon committed Oct 4, 2024
1 parent 119c60e commit 79a7c25
Show file tree
Hide file tree
Showing 9 changed files with 306 additions and 254 deletions.
30 changes: 30 additions & 0 deletions nntrainer/cl_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <addition_layer_cl.h>
#include <blas_kernel_strings.h>
#include <cl_context.h>
#include <concat_cl.h>
#include <fc_layer_cl.h>
Expand Down Expand Up @@ -123,6 +124,35 @@ const int ClContext::registerFactory(const FactoryType<T> factory,
return assigned_int_key;
}

/**
 * @brief Register the default blas OpenCL kernels with this context.
 *        Registration happens once; subsequent invocations only log and
 *        return.
 */
void ClContext::initBlasClKernels() {
  if (blas_kernels_initialized) {
    ml_logi(
      "ClContext: Default blas kernels already registered and initialized");
    return;
  }

  /// table entry pairing a kernel source string with its kernel identifier
  struct KernelEntry {
    const std::string *source;
    const char *name;
  };

  const KernelEntry default_kernels[] = {
    {&sgemv_cl_kernel_, "sgemv_cl"},
    {&dot_cl_kernel_, "dot_cl"},
    {&sgemm_cl_noTrans_kernel_, "sgemm_cl_noTrans"},
    {&sgemm_cl_transA_kernel_, "sgemm_cl_transA"},
    {&sgemm_cl_transB_kernel_, "sgemm_cl_transB"},
    {&sgemm_cl_transAB_kernel_, "sgemm_cl_transAB"},
    {&addition_cl_kernel_, "addition_cl"},
    {&sscal_cl_kernel_, "sscal_cl"},
#ifdef ENABLE_FP16
    {&sgemv_cl_kernel_fp16_, "sgemv_cl_fp16"},
    {&dot_cl_kernel_fp16_, "dot_cl_fp16"},
    {&sgemm_cl_noTrans_kernel_fp16_, "sgemm_cl_noTrans_fp16"},
    {&sgemm_cl_transA_kernel_fp16_, "sgemm_cl_transA_fp16"},
    {&sgemm_cl_transB_kernel_fp16_, "sgemm_cl_transB_fp16"},
    {&sgemm_cl_transAB_kernel_fp16_, "sgemm_cl_transAB_fp16"},
    {&addition_cl_kernel_fp16_, "addition_cl_fp16"},
    {&sscal_cl_kernel_fp16_, "sscal_cl_fp16"},
#endif
  };

  // Same registration calls, in the same order, as listed above.
  for (const auto &entry : default_kernels)
    registerClKernel(*entry.source, entry.name);

  blas_kernels_initialized = true;
}

const ClContext::SharedPtrClKernel &
ClContext::registerClKernel(std::string kernel_string,
std::string kernel_name) {
Expand Down
12 changes: 10 additions & 2 deletions nntrainer/cl_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ class ClContext {
const SharedPtrClKernel &registerClKernel(std::string kernel_string,
std::string kernel_name);

/**
* @brief Initialize and register all blas OpenCl kernels
*/
void initBlasClKernels();

/**
* @brief destructor to release opencl commandQueue
*/
Expand All @@ -221,12 +226,15 @@ class ClContext {
// flag to check opencl commandqueue and context initialization
bool cl_initialized = false;

// flag to check default blas kernels registered or not
bool blas_kernels_initialized = false;

FactoryMap<nntrainer::Layer> factory_map;

template <typename Args, typename T> struct isSupportedHelper;

// map to store initialized opencl::Kernel
OclKernelMap ocl_kernel_map;
// global map to store initialized opencl::Kernel
inline static OclKernelMap ocl_kernel_map;

/**
* @brief supportHelper to check if given type is supported within cl context
Expand Down
2 changes: 2 additions & 0 deletions nntrainer/layers/layer_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ createLayerNode(const ml::train::LayerType &type,
#ifdef ENABLE_OPENCL
if (compute_engine == ml::train::LayerComputeEngine::GPU) {
auto &cc = nntrainer::ClContext::Global();
cc.initBlasClKernels();
return createLayerNode(cc.createObject<nntrainer::Layer>(type), properties,
compute_engine);
}
Expand All @@ -157,6 +158,7 @@ createLayerNode(const std::string &type,
#ifdef ENABLE_OPENCL
if (compute_engine == ml::train::LayerComputeEngine::GPU) {
auto &cc = nntrainer::ClContext::Global();
cc.initBlasClKernels();
return createLayerNode(cc.createObject<nntrainer::Layer>(type), properties,
compute_engine);
}
Expand Down
249 changes: 249 additions & 0 deletions nntrainer/tensor/cl_operations/blas_kernel_strings.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Debadri Samaddar <[email protected]>
*
* @file blas_kernel_strings.h
* @date 18 Sep 2024
* @brief All blas OpenCL kernel strings
* @see https://github.com/nnstreamer/nntrainer
* @author Debadri Samaddar <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#ifndef __BLAS_KERNEL_STRINGS_H__
#define __BLAS_KERNEL_STRINGS_H__

#include <string>

namespace nntrainer {
static const std::string sgemv_cl_kernel_ =
R"(__kernel void sgemv_cl(const __global float* A, const __global float* X,
__global float* Y, unsigned int N, unsigned int lda) {
unsigned int i;
i = get_global_id(0);
float y0 = 0.0f;
for (unsigned int j = 0; j < N; j++)
y0 += A[i + j * lda] * X[j];
Y[i] = y0;
})";

static const std::string dot_cl_kernel_ =
R"(__kernel void dot_cl(const __global float* A, const __global float* X, unsigned int K, __global float* res) {
*res = 0;
for (unsigned int i = 0; i < K; i++){
*res += A[i] * X[i];
}
})";

static const std::string sgemm_cl_noTrans_kernel_ =
R"(__kernel void sgemm_cl_noTrans(const __global float* A, const __global float* B,
__global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[m * lda + k];
b = B[k * ldb + n];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string sgemm_cl_transA_kernel_ =
R"(__kernel void sgemm_cl_transA(const __global float* A, const __global float* B,
__global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[k * lda + m];
b = B[k * ldb + n];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string sgemm_cl_transB_kernel_ =
R"(__kernel void sgemm_cl_transB(const __global float *A, const __global float *B,
__global float *C, unsigned int K,
unsigned int lda, unsigned int ldb,
unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[m * lda + k];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string sgemm_cl_transAB_kernel_ =
R"(__kernel void sgemm_cl_transAB(const __global float *A, const __global float *B,
__global float *C, unsigned int K,
unsigned int lda, unsigned int ldb,
unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[k * lda + m];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string addition_cl_kernel_ =
R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
#pragma printf_support
size_t idx = get_global_id(0);
if (idx < size) {
output[idx] = output[idx] + input[idx];
}
})";

static const std::string sscal_cl_kernel_ =
R"(__kernel void sscal_cl(__global float* X, const float alpha) {
unsigned int i = get_global_id(0);
X[i] *= alpha;
})";

#ifdef ENABLE_FP16
static const std::string sgemv_cl_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void sgemv_cl_fp16(const __global half* A, const __global half* X,
__global half* Y, unsigned int N, unsigned int lda) {
unsigned int i;
i = get_global_id(0);
half y0 = 0.0f;
for (unsigned int j = 0; j < N; j++)
y0 += A[i + j * lda] * X[j];
Y[i] = y0;
})";

static const std::string dot_cl_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void dot_cl_fp16(const __global half* A, const __global half* X, unsigned int K, __global half* res) {
*res = 0;
for (unsigned int i = 0; i < K; i++){
*res += A[i] * X[i];
}
})";

static const std::string sgemm_cl_noTrans_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void sgemm_cl_noTrans_fp16(const __global half* A, const __global half* B,
__global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
half c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
half a, b;
a = A[m * lda + k];
b = B[k * ldb + n];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string sgemm_cl_transA_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void sgemm_cl_transA_fp16(const __global half* A, const __global half* B,
__global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
half c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
half a, b;
a = A[k * lda + m];
b = B[k * ldb + n];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string sgemm_cl_transB_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void sgemm_cl_transB_fp16(const __global half* A, const __global half* B,
__global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
half c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
half a, b;
a = A[m * lda + k];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string sgemm_cl_transAB_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void sgemm_cl_transAB_fp16(const __global half* A, const __global half* B,
__global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
half c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
half a, b;
a = A[k * lda + m];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

static const std::string addition_cl_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void addition_cl_fp16(__global const half* input, __global half* output, const unsigned int size) {
size_t idx = get_global_id(0);
if (idx < size) {
output[idx] = output[idx] + input[idx];
}
})";

static const std::string sscal_cl_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void sscal_cl_fp16(__global half* X, const float alpha) {
unsigned int i = get_global_id(0);
X[i] *= alpha;
})";
#endif
} // namespace nntrainer
#endif /* __BLAS_KERNEL_INTERFACE_H__ */
Loading

0 comments on commit 79a7c25

Please sign in to comment.