Skip to content

Commit

Permalink
[ Tensor ] Remove CBLAS params from Tensor related files.
Browse files Browse the repository at this point in the history
- Remove CBLAS params from tensor-related files, since nntrainer is no longer fully dependent on CBLAS.
- Making tensors aware of CBLAS-related parameters made no sense in the first place.
- CBLAS params will now be declared only where CBLAS functions are actually called.

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
  • Loading branch information
skykongkong8 committed Aug 12, 2024
1 parent 3b11453 commit c952f2f
Show file tree
Hide file tree
Showing 8 changed files with 186 additions and 190 deletions.
170 changes: 92 additions & 78 deletions nntrainer/tensor/blas_interface.cpp

Large diffs are not rendered by default.

27 changes: 6 additions & 21 deletions nntrainer/tensor/blas_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,6 @@
#define __BLAS_INTERFACE_H_
#ifdef __cplusplus

#ifdef USE_BLAS
extern "C" {
#include <cblas.h>
}
#else
enum CBLAS_ORDER { CblasRowMajor = 101, CblasColMajor = 102 };

enum CBLAS_TRANSPOSE {
CblasNoTrans = 111,
CblasTrans = 112,
CblasConjTrans = 113
};

#endif

#ifdef USE_CUBLAS
#include <helper_cuda.h>
#include <helper_functions.h>
Expand Down Expand Up @@ -132,7 +117,7 @@ void saxpy(const unsigned int N, const float alpha, const _FP16 *X,
* @param[in] alpha float number
* @param[in] beta float number
*/
void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
const unsigned int M, const unsigned int N, const unsigned int K,
const float alpha, const _FP16 *A, const unsigned int lda,
const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
Expand All @@ -147,7 +132,7 @@ void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
* @param[in] alpha float number
* @param[in] beta float number
*/
void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
const unsigned int N, const float alpha, const _FP16 *A,
const unsigned int lda, const _FP16 *X, const int incX,
const float beta, _FP16 *Y, const int incY);
Expand Down Expand Up @@ -346,7 +331,7 @@ void saxpy(const unsigned int N, const float alpha, const float *X,
* @param[in] alpha float number
* @param[in] beta float number
*/
void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
const unsigned int M, const unsigned int N, const unsigned int K,
const float alpha, const void *A, const unsigned int lda,
const void *B, const unsigned int ldb, const float beta, void *C,
Expand All @@ -363,7 +348,7 @@ void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
* @param[in] alpha float number
* @param[in] beta float number
*/
void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
const unsigned int M, const unsigned int N, const unsigned int K,
const float alpha, const float *A, const unsigned int lda,
const float *B, const unsigned int ldb, const float beta, float *C,
Expand All @@ -378,7 +363,7 @@ void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
* @param[in] alpha float number
* @param[in] beta float number
*/
void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
const unsigned int N, const float alpha, const void *A,
const unsigned int lda, const void *X, const int incX,
const float beta, void *Y, const int incY,
Expand All @@ -393,7 +378,7 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
* @param[in] alpha float number
* @param[in] beta float number
*/
void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
const unsigned int N, const float alpha, const float *A,
const unsigned int lda, const float *X, const int incX,
const float beta, float *Y, const int incY);
Expand Down
26 changes: 10 additions & 16 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,6 @@ void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
const float *data = input.getData();
const float *mdata = m.getData();
float *rdata = result.getData();
enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;

/// shortcut handling in case of vector
/// for vector, (1 * K) == (K * 1) in current memory layout...
Expand All @@ -134,29 +132,26 @@ void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
}
/// case2: (M * K) X (K * 1)
else if (N == 1) {
transA ? sgemv_cl(data, mdata, rdata, dim2, dim1, lda, context)
: sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
trans ? sgemv_cl(data, mdata, rdata, dim2, dim1, lda, context)
: sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
}
/// case3: (1 * K) X (K * N) = 1 * N = R
/// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
/// Effectively a translation of sgemv
else if (M == 1) {
transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
transB ? sgemv_cl(mdata, data, rdata, mdim2, mdim1, ldb, context)
: sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
trans_m ? sgemv_cl(mdata, data, rdata, mdim2, mdim1, ldb, context)
: sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
}
/// case others: use gemm
else {
sgemm_cl(transA, transB, data, mdata, rdata, M, N, K, lda, ldb, ldc,
sgemm_cl(trans, trans_m, data, mdata, rdata, M, N, K, lda, ldb, ldc,
context);
}
} else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
const _FP16 *data = input.getData<_FP16>();
const _FP16 *mdata = m.getData<_FP16>();
_FP16 *rdata = result.getData<_FP16>();
enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;

/// shortcut handling in case of vector
/// for vector, (1 * K) == (K * 1) in current memory layout...
Expand All @@ -170,20 +165,19 @@ void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
}
/// case2: (M * K) X (K * 1)
else if (N == 1) {
transA ? sgemv_cl(data, mdata, rdata, dim2, dim1, lda, context)
: sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
trans ? sgemv_cl(data, mdata, rdata, dim2, dim1, lda, context)
: sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
}
/// case3: (1 * K) X (K * N) = 1 * N = R
/// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
/// Effectively a translation of sgemv
else if (M == 1) {
transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
transB ? sgemv_cl(mdata, data, rdata, mdim2, mdim1, ldb, context)
: sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
trans_m ? sgemv_cl(mdata, data, rdata, mdim2, mdim1, ldb, context)
: sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
}
/// case others: use sgemm
else {
sgemm_cl(transA, transB, data, mdata, rdata, M, N, K, lda, ldb, ldc,
sgemm_cl(trans, trans_m, data, mdata, rdata, M, N, K, lda, ldb, ldc,
context);
}
#else
Expand Down
14 changes: 7 additions & 7 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,24 +282,24 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
return cl_ret;
}

void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const float *A,
const float *B, float *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context) {
void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
float *C, unsigned int M, unsigned int N, unsigned int K,
unsigned int lda, unsigned int ldb, unsigned int ldc,
RunLayerContext &context) {

opencl::Kernel *kernel_sgemm = nullptr;
RunLayerContext::LayerKernel layerKernel;
std::string sgemm_cl_kernel_;

if (TransA != CblasTrans && TransB != CblasTrans) {
if (!TransA && !TransB) {
kernel_sgemm = &kernel_sgemm_noTrans;
layerKernel = context.LayerKernel::SGEMM_NOTRANS;
sgemm_cl_kernel_ = sgemm_cl_noTrans_kernel_;
} else if (TransA == CblasTrans && TransB != CblasTrans) {
} else if (TransA && !TransB) {
kernel_sgemm = &kernel_sgemm_transA;
layerKernel = context.LayerKernel::SGEMM_TRANSA;
sgemm_cl_kernel_ = sgemm_cl_transA_kernel_;
} else if (TransA != CblasTrans && TransB == CblasTrans) {
} else if (!TransA && TransB) {
kernel_sgemm = &kernel_sgemm_transB;
layerKernel = context.LayerKernel::SGEMM_TRANSB;
sgemm_cl_kernel_ = sgemm_cl_transB_kernel_;
Expand Down
24 changes: 12 additions & 12 deletions nntrainer/tensor/cl_operations/blas_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
/**
* @brief sgemm computation : Y = op(A)*op(B) + C,
* where op(X) is one of X or X**T
* @param[in] transA CBLAS_TRANSPOSE
* @param[in] transB CBLAS_TRANSPOSE
* @param[in] transA bool transpose
* @param[in] transB bool transpose
* @param[in] A float * for Matrix A
* @param[in] B float * for Matrix B
* @param[in] C float * for Matrix C
Expand All @@ -74,10 +74,10 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
* @param[in] ldc number of C's columns
* @param[in] context RunLayerContext reference
*/
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const float *A,
const float *B, float *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context);
void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
float *C, unsigned int M, unsigned int N, unsigned int K,
unsigned int lda, unsigned int ldb, unsigned int ldc,
RunLayerContext &context);

/**
* @brief addition : sum of all input vectors
Expand Down Expand Up @@ -140,8 +140,8 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
/**
* @brief fp16 sgemm computation : Y = op(A)*op(B) + C,
* where op(X) is one of X or X**T
* @param[in] transA CBLAS_TRANSPOSE
* @param[in] transB CBLAS_TRANSPOSE
* @param[in] transA bool transpose
* @param[in] transB bool transpose
* @param[in] A fp16 * for Matrix A
* @param[in] B fp16 * for Matrix B
* @param[in] C fp16 * for Matrix C
Expand All @@ -153,10 +153,10 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
* @param[in] ldc number of C's columns
* @param[in] context RunLayerContext reference
*/
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const __fp16 *A,
const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context);
void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
__fp16 *C, unsigned int M, unsigned int N, unsigned int K,
unsigned int lda, unsigned int ldb, unsigned int ldc,
RunLayerContext &context);

/**
* @brief fp16 addition : sum of all input vectors
Expand Down
14 changes: 7 additions & 7 deletions nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,24 +302,24 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
return cl_ret;
}

void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const __fp16 *A,
const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context) {
void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
__fp16 *C, unsigned int M, unsigned int N, unsigned int K,
unsigned int lda, unsigned int ldb, unsigned int ldc,
RunLayerContext &context) {

opencl::Kernel *kernel_sgemm_fp16 = nullptr;
RunLayerContext::LayerKernel layerKernel;
std::string sgemm_cl_kernel_fp16_;

if (TransA != CblasTrans && TransB != CblasTrans) {
if (!TransA && !TransB) {
kernel_sgemm_fp16 = &kernel_sgemm_noTrans_fp16;
layerKernel = context.LayerKernel::SGEMM_NOTRANS_FP16;
sgemm_cl_kernel_fp16_ = sgemm_cl_noTrans_kernel_fp16_;
} else if (TransA == CblasTrans && TransB != CblasTrans) {
} else if (TransA && !TransB) {
kernel_sgemm_fp16 = &kernel_sgemm_transA_fp16;
layerKernel = context.LayerKernel::SGEMM_TRANSA_FP16;
sgemm_cl_kernel_fp16_ = sgemm_cl_transA_kernel_fp16_;
} else if (TransA != CblasTrans && TransB == CblasTrans) {
} else if (!TransA && TransB) {
kernel_sgemm_fp16 = &kernel_sgemm_transB_fp16;
layerKernel = context.LayerKernel::SGEMM_TRANSB_FP16;
sgemm_cl_kernel_fp16_ = sgemm_cl_transB_kernel_fp16_;
Expand Down
Loading

0 comments on commit c952f2f

Please sign in to comment.