[blas/opencl] SGEMM OpenCL kernels added
Added OpenCL SGEMM kernels for all transpose combinations (noTrans, transA, transB, transAB)
Added unit tests

Signed-off-by: Debadri Samaddar <[email protected]>
s-debadri committed Jul 2, 2024
1 parent b9c27e9 commit d156796
Showing 7 changed files with 505 additions and 67 deletions.
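For orientation: this commit replaces the single sgemm_cl entry point with a transpose-aware overload, declared in nntrainer/tensor/cl_operations/blas_kernels.h below. The caller-side sketch here is illustrative only; the include path, the example_sgemm_transA name, and the buffer layout comments are assumptions, and a valid RunLayerContext would come from the calling layer.

#include <blas_kernels.h> // assumed include path

// Sketch: C (M x N) = A^T * B, with A stored K x M and B stored K x N,
// both row-major, matching the indexing of the sgemm_cl_transA kernel.
void example_sgemm_transA(const float *A, const float *B, float *C,
                          unsigned int M, unsigned int N, unsigned int K,
                          nntrainer::RunLayerContext &context) {
  unsigned int lda = M; // leading dimension of A as stored (K x M)
  unsigned int ldb = N; // leading dimension of B as stored (K x N)
  unsigned int ldc = N; // leading dimension of C (M x N)
  nntrainer::sgemm_cl(CblasTrans, CblasNoTrans, A, B, C, M, N, K, lda, ldb,
                      ldc, context);
}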
24 changes: 18 additions & 6 deletions nntrainer/layers/layer_context.cpp
@@ -681,16 +681,28 @@ std::string RunLayerContext::getKernelName(LayerKernel layerKernel) {
switch (layerKernel) {
case LayerKernel::SGEMV:
return "sgemv_cl";
case LayerKernel::DOT:
return "dot_cl";
case LayerKernel::SGEMM:
return "sgemm_cl";
case LayerKernel::SGEMV_FP16:
return "sgemv_cl_fp16";
case LayerKernel::DOT:
return "dot_cl";
case LayerKernel::DOT_FP16:
return "dot_cl_fp16";
case LayerKernel::SGEMM_FP16:
return "sgemm_cl_fp16";
case LayerKernel::SGEMM_NOTRANS:
return "sgemm_cl_noTrans";
case LayerKernel::SGEMM_NOTRANS_FP16:
return "sgemm_cl_noTrans_fp16";
case LayerKernel::SGEMM_TRANSA:
return "sgemm_cl_transA";
case LayerKernel::SGEMM_TRANSA_FP16:
return "sgemm_cl_transA_fp16";
case LayerKernel::SGEMM_TRANSB:
return "sgemm_cl_transB";
case LayerKernel::SGEMM_TRANSB_FP16:
return "sgemm_cl_transB_fp16";
case LayerKernel::SGEMM_TRANSAB:
return "sgemm_cl_transAB";
case LayerKernel::SGEMM_TRANSAB_FP16:
return "sgemm_cl_transAB_fp16";
case LayerKernel::ADD:
return "addition_cl";
case LayerKernel::ADD_FP16:
30 changes: 18 additions & 12 deletions nntrainer/layers/layer_context.h
Expand Up @@ -830,18 +830,24 @@ class RunLayerContext {
* getKernelName function.
*/
enum LayerKernel {
SGEMV = 1 << 0, /**< placeholder for kernel name */
DOT = 1 << 1, /**< placeholder for kernel name */
SGEMM = 1 << 2, /**< placeholder for kernel name */
SGEMV_FP16 = 1 << 3, /**< placeholder for kernel name */
DOT_FP16 = 1 << 4, /**< placeholder for kernel name */
SGEMM_FP16 = 1 << 5, /**< placeholder for kernel name */
ADD = 1 << 6, /**< placeholder for kernel name */
ADD_FP16 = 1 << 7, /**< placeholder for kernel name */
SWIGLU = 1 << 8, /**< placeholder for kernel name */
SWIGLU_FP16 = 1 << 9, /**< placeholder for kernel name */
SSCAL = 1 << 10, /**< placeholder for kernel name */
SSCAL_FP16 = 1 << 11, /**< placeholder for kernel name */
SGEMV = 1 << 0, /**< placeholder for kernel name */
SGEMV_FP16 = 1 << 1, /**< placeholder for kernel name */
DOT = 1 << 2, /**< placeholder for kernel name */
DOT_FP16 = 1 << 3, /**< placeholder for kernel name */
SGEMM_NOTRANS = 1 << 4, /**< placeholder for kernel name */
SGEMM_NOTRANS_FP16 = 1 << 5, /**< placeholder for kernel name */
SGEMM_TRANSA = 1 << 6, /**< placeholder for kernel name */
SGEMM_TRANSA_FP16 = 1 << 7, /**< placeholder for kernel name */
SGEMM_TRANSB = 1 << 8, /**< placeholder for kernel name */
SGEMM_TRANSB_FP16 = 1 << 9, /**< placeholder for kernel name */
SGEMM_TRANSAB = 1 << 10, /**< placeholder for kernel name */
SGEMM_TRANSAB_FP16 = 1 << 11, /**< placeholder for kernel name */
ADD = 1 << 12, /**< placeholder for kernel name */
ADD_FP16 = 1 << 13, /**< placeholder for kernel name */
SWIGLU = 1 << 14, /**< placeholder for kernel name */
SWIGLU_FP16 = 1 << 15, /**< placeholder for kernel name */
SSCAL = 1 << 16, /**< placeholder for kernel name */
SSCAL_FP16 = 1 << 17, /**< placeholder for kernel name */
};

/**
10 changes: 4 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -147,9 +147,8 @@ void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
}
/// case others: use gemm
else {
// transA == false, transB == false
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
// todo: other condition implementations
sgemm_cl(transA, transB, data, mdata, rdata, M, N, K, lda, ldb, ldc,
context);
}
} else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
@@ -184,9 +183,8 @@ void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
}
/// case others: use sgemm
else {
// transA == false, transB == false
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
// todo: other condition implementations
sgemm_cl(transA, transB, data, mdata, rdata, M, N, K, lda, ldb, ldc,
context);
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
111 changes: 95 additions & 16 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
@@ -35,8 +35,8 @@ std::string dot_cl_kernel_ =
}
})";

std::string sgemm_cl_kernel_ =
R"(__kernel void sgemm_cl(const __global float* A, const __global float* B,
std::string sgemm_cl_noTrans_kernel_ =
R"(__kernel void sgemm_cl_noTrans(const __global float* A, const __global float* B,
__global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
@@ -51,6 +51,58 @@ std::string sgemm_cl_kernel_ =
C[m * ldc + n] = c;
})";

std::string sgemm_cl_transA_kernel_ =
R"(__kernel void sgemm_cl_transA(const __global float* A, const __global float* B,
__global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[k * lda + m];
b = B[k * ldb + n];
c += a * b;
}
C[m * ldc + n] = c;
})";

std::string sgemm_cl_transB_kernel_ =
R"(__kernel void sgemm_cl_transB(const __global float *A, const __global float *B,
__global float *C, unsigned int K,
unsigned int lda, unsigned int ldb,
unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[m * lda + k];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

std::string sgemm_cl_transAB_kernel_ =
R"(__kernel void sgemm_cl_transAB(const __global float *A, const __global float *B,
__global float *C, unsigned int K,
unsigned int lda, unsigned int ldb,
unsigned int ldc) {
unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[k * lda + m];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

std::string addition_cl_kernel_ =
R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
#pragma printf_support
@@ -71,7 +123,10 @@ std::string sscal_cl_kernel_ =
* @brief defining global kernel objects
*/
opencl::Kernel kernel_sgemv;
opencl::Kernel kernel_sgemm;
opencl::Kernel kernel_sgemm_transAB;
opencl::Kernel kernel_sgemm_transA;
opencl::Kernel kernel_sgemm_transB;
opencl::Kernel kernel_sgemm_noTrans;
opencl::Kernel kernel_dot;
opencl::Kernel kernel_addition;
opencl::Kernel kernel_sscal;
@@ -227,19 +282,43 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
return cl_ret;
}

void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context) {
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const float *A,
const float *B, float *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context) {

opencl::Kernel *kernel_sgemm = nullptr;
RunLayerContext::LayerKernel layerKernel;
std::string sgemm_cl_kernel_;

if (TransA != CblasTrans && TransB != CblasTrans) {
kernel_sgemm = &kernel_sgemm_noTrans;
layerKernel = context.LayerKernel::SGEMM_NOTRANS;
sgemm_cl_kernel_ = sgemm_cl_noTrans_kernel_;
} else if (TransA == CblasTrans && TransB != CblasTrans) {
kernel_sgemm = &kernel_sgemm_transA;
layerKernel = context.LayerKernel::SGEMM_TRANSA;
sgemm_cl_kernel_ = sgemm_cl_transA_kernel_;
} else if (TransA != CblasTrans && TransB == CblasTrans) {
kernel_sgemm = &kernel_sgemm_transB;
layerKernel = context.LayerKernel::SGEMM_TRANSB;
sgemm_cl_kernel_ = sgemm_cl_transB_kernel_;
} else {
kernel_sgemm = &kernel_sgemm_transAB;
layerKernel = context.LayerKernel::SGEMM_TRANSAB;
sgemm_cl_kernel_ = sgemm_cl_transAB_kernel_;
}

bool result = false;

do {
result = context.clCreateKernel(sgemm_cl_kernel_,
context.LayerKernel::SGEMM, kernel_sgemm);
result =
context.clCreateKernel(sgemm_cl_kernel_, layerKernel, *kernel_sgemm);
if (!result) {
break;
}

// sizes will be same for transpose
size_t m_k_size = M * K * sizeof(float);
size_t k_n_size = K * N * sizeof(float);
size_t m_n_size = M * N * sizeof(float);
@@ -265,37 +344,37 @@ void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
break;
}

result = kernel_sgemm.SetKernelArguments(0, &inputA, sizeof(cl_mem));
result = kernel_sgemm->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(1, &inputB, sizeof(cl_mem));
result = kernel_sgemm->SetKernelArguments(1, &inputB, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(2, &inOutC, sizeof(cl_mem));
result = kernel_sgemm->SetKernelArguments(2, &inOutC, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(3, &K, sizeof(int));
result = kernel_sgemm->SetKernelArguments(3, &K, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(4, &lda, sizeof(int));
result = kernel_sgemm->SetKernelArguments(4, &lda, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(5, &ldb, sizeof(int));
result = kernel_sgemm->SetKernelArguments(5, &ldb, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(6, &ldc, sizeof(int));
result = kernel_sgemm->SetKernelArguments(6, &ldc, sizeof(int));
if (!result) {
break;
}
@@ -304,7 +383,7 @@ void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
const int work_group_size[3] = {32, 32, 1}; // test-value

result = context.command_queue_inst_.DispatchCommand(
kernel_sgemm, work_groups_count, work_group_size);
*kernel_sgemm, work_groups_count, work_group_size);
if (!result) {
break;
}
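The four kernel variants above differ only in how A and B are indexed inside the k-loop. As a sanity check against the device output, the host-side reference below mirrors that indexing; it is a sketch for illustration and is not part of the changed files.

// Host reference mirroring the per-variant indexing of the OpenCL kernels:
// noTrans reads A[m*lda+k] and B[k*ldb+n]; transA flips A to A[k*lda+m];
// transB flips B to B[n*ldb+k]; transAB flips both.
void sgemm_ref(bool transA, bool transB, const float *A, const float *B,
               float *C, unsigned int M, unsigned int N, unsigned int K,
               unsigned int lda, unsigned int ldb, unsigned int ldc) {
  for (unsigned int m = 0; m < M; ++m) {
    for (unsigned int n = 0; n < N; ++n) {
      float c = 0.0f;
      for (unsigned int k = 0; k < K; ++k) {
        float a = transA ? A[k * lda + m] : A[m * lda + k];
        float b = transB ? B[n * ldb + k] : B[k * ldb + n];
        c += a * b;
      }
      C[m * ldc + n] = c;
    }
  }
}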
28 changes: 20 additions & 8 deletions nntrainer/tensor/cl_operations/blas_kernels.h
@@ -25,7 +25,10 @@ namespace nntrainer {
* @brief declaring global kernel objects
*/
extern opencl::Kernel kernel_sgemv;
extern opencl::Kernel kernel_sgemm;
extern opencl::Kernel kernel_sgemm_noTrans;
extern opencl::Kernel kernel_sgemm_transAB;
extern opencl::Kernel kernel_sgemm_transA;
extern opencl::Kernel kernel_sgemm_transB;
extern opencl::Kernel kernel_dot;
extern opencl::Kernel kernel_addition;
extern opencl::Kernel kernel_sscal;
@@ -58,6 +61,8 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
/**
* @brief sgemm computation : Y = op(A)*op(B) + C,
* where op(X) is one of X or X**T
* @param[in] transA CBLAS_TRANSPOSE
* @param[in] transB CBLAS_TRANSPOSE
* @param[in] A float * for Matrix A
* @param[in] B float * for Matrix B
* @param[in] C float * for Matrix C
@@ -69,9 +74,10 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
* @param[in] ldc number of C's columns
* @param[in] context RunLayerContext reference
*/
void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context);
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const float *A,
const float *B, float *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context);

/**
* @brief addition : sum of all input vectors
@@ -98,7 +104,10 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
* @brief declaring global fp16 kernel objects
*/
extern opencl::Kernel kernel_sgemv_fp16;
extern opencl::Kernel kernel_sgemm_fp16;
extern opencl::Kernel kernel_sgemm_noTrans_fp16;
extern opencl::Kernel kernel_sgemm_transAB_fp16;
extern opencl::Kernel kernel_sgemm_transA_fp16;
extern opencl::Kernel kernel_sgemm_transB_fp16;
extern opencl::Kernel kernel_dot_fp16;
extern opencl::Kernel kernel_addition_fp16;
extern opencl::Kernel kernel_sscal_fp16;
@@ -131,6 +140,8 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
/**
* @brief fp16 sgemm computation : Y = op(A)*op(B) + C,
* where op(X) is one of X or X**T
* @param[in] transA CBLAS_TRANSPOSE
* @param[in] transB CBLAS_TRANSPOSE
* @param[in] A fp16 * for Matrix A
* @param[in] B fp16 * for Matrix B
* @param[in] C fp16 * for Matrix C
@@ -142,9 +153,10 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
* @param[in] ldc number of C's columns
* @param[in] context RunLayerContext reference
*/
void sgemm_cl(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context);
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const __fp16 *A,
const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context);

/**
* @brief fp16 addition : sum of all input vectors
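The unit tests mentioned in the commit message live in a file that did not load in this view. A check in the same spirit might look like the sketch below, which compares the noTrans x transB path against the host reference sketched earlier; the function name, sizes, and tolerance are assumptions, and a usable RunLayerContext has to be supplied by the test harness.

#include <cmath>
#include <vector>

// Hypothetical check: GPU sgemm_cl (noTrans x transB) vs. the host reference.
bool check_sgemm_transB(nntrainer::RunLayerContext &context) {
  const unsigned int M = 8, N = 8, K = 16;
  std::vector<float> A(M * K, 1.0f);  // A stored M x K
  std::vector<float> B(N * K, 2.0f);  // B stored N x K (transB layout)
  std::vector<float> C_gpu(M * N, 0.0f), C_ref(M * N, 0.0f);

  nntrainer::sgemm_cl(CblasNoTrans, CblasTrans, A.data(), B.data(),
                      C_gpu.data(), M, N, K, /*lda=*/K, /*ldb=*/K, /*ldc=*/N,
                      context);
  sgemm_ref(false, true, A.data(), B.data(), C_ref.data(), M, N, K, K, K, N);

  for (unsigned int i = 0; i < M * N; ++i)
    if (std::fabs(C_gpu[i] - C_ref[i]) > 1e-4f)
      return false;
  return true;
}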