[ hgemm ] Consider K=1 changes
- The current implementation is rooted in general cases, and thus optimizes only w.r.t. accumulation along K.
- However, for an (M,1) x (1,N) computation, optimizations such as packing and transposing are of no use.
- Implementing an explicit kernel function for this case resolved the latency issue (see the scalar sketch below).
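For context: when K == 1 the product collapses to a rank-1 outer product, C[m][n] = A[m] * B[n], a single multiply per output element, so there is no K-accumulation to pack or block for. A scalar reference of the targeted case (hgemm_K1_reference is a hypothetical name, illustration only):

// C (M x N) = A (M x 1) * B (1 x N): one multiply per element of C.
void hgemm_K1_reference(unsigned int M, unsigned int N, const __fp16 *A,
                        const __fp16 *B, __fp16 *C, unsigned int ldc) {
  for (unsigned int m = 0; m < M; ++m)
    for (unsigned int n = 0; n < N; ++n)
      C[m * ldc + n] = A[m] * B[n];
}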

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
skykongkong8 committed Jun 27, 2024
1 parent ed2d27f commit bfc0160
Showing 3 changed files with 39 additions and 1 deletion.
4 changes: 3 additions & 1 deletion nntrainer/tensor/blas_neon.cpp
@@ -1588,7 +1588,9 @@ unsigned int isamax(const unsigned int N, const __fp16 *X) {

void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, uint32_t N,
           uint32_t K, float alpha, float beta, bool TransA, bool TransB) {

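  // K == 1 collapses the GEMM to an (M,1) x (1,N) outer product; skip the
  // packing/blocking machinery and use the dedicated kernel.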
  if (K == 1) {
    return hgemm_K1(M, N, K, A, K, B, N, C, N, alpha, beta);
  }
  // dynamic allocation to avoid reaching the stack limit (would cause a segmentation fault)
  float *C32 = (float *)malloc(M * N * sizeof(float));

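With this dispatch in place, a K == 1 call takes the new path. A minimal usage sketch (example_k1_call and the buffer sizes are hypothetical; assumes the hgemm declaration from blas_neon.h is visible):

void example_k1_call() {
  __fp16 A[4] = {1.0, 2.0, 3.0, 4.0};                     // M x 1 operand
  __fp16 B[8] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0}; // 1 x N operand
  __fp16 C[4 * 8] = {};
  // K == 1 routes to hgemm_K1 instead of the packed/blocked path.
  hgemm(A, B, C, 4, 8, 1, 1.F, 0.F, false, false);
  // Now C[m * 8 + n] == A[m] * B[n] for every m, n.
}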
17 changes: 17 additions & 0 deletions nntrainer/tensor/hgemm/hgemm.cpp
@@ -76,6 +76,23 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
}
}

void hgemm_K1(unsigned int M, unsigned int N, unsigned int K,
              const __fp16 *A, unsigned int lda, const __fp16 *B,
              unsigned int ldb, __fp16 *C, unsigned int ldc,
              float alpha, float beta) {
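  // Each row of C is the scalar A[m] times the row vector B. The main loop
  // handles eight half-precision lanes per iteration; a scalar loop covers
  // the remaining N % 8 columns. Note: alpha and beta are accepted for
  // signature parity with the other kernels but are not applied here.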
  float16x8_t a_vec;
  unsigned int N8 = (N >> 3) << 3;
  for (unsigned int m = 0; m < M; ++m) {
    a_vec = vmovq_n_f16(A[m]);
    for (unsigned int n = 0; n < N8; n += 8) {
      vst1q_f16(&C[m * ldc + n], vmulq_f16(a_vec, vld1q_f16(&B[n])));
    }
    for (unsigned int n = N8; n < N; ++n) {
      C[m * ldc + n] = A[m] * B[n];
    }
  }
}

void hgemm_noTrans_1x4(unsigned int M, unsigned int N, unsigned int K,
const __fp16 *A, unsigned int lda, const __fp16 *B,
unsigned int ldb, __fp16 *C, unsigned int ldc,
Expand Down
19 changes: 19 additions & 0 deletions nntrainer/tensor/hgemm/hgemm.h
@@ -61,6 +61,25 @@ void hgemm_noTrans_fallback(unsigned int M, unsigned int N, unsigned int K,
                            unsigned int ldb, float *C, unsigned int ldc,
                            float alpha = 1.F, float beta = 0.F);

/**
 * @brief hgemm computation with neon when K = 1 : C = A * B,
 * where A is an M x 1 vector and B is a 1 x N vector
 * @param M length of the row of matrix A
 * @param N length of the col of matrix B
 * @param K length of the col of matrix A (1 for this kernel)
 * @param A input matrix A
 * @param lda length of the col of matrix A
 * @param B input matrix B
 * @param ldb length of the col of matrix B
 * @param C output matrix C
 * @param ldc length of the col of matrix C
 * @param[in] alpha float number
 * @param[in] beta float number
 */
void hgemm_K1(unsigned int M, unsigned int N, unsigned int K,
              const __fp16 *A, unsigned int lda, const __fp16 *B,
              unsigned int ldb, __fp16 *C, unsigned int ldc,
              float alpha = 1.F, float beta = 0.F);

/**
 * @brief hgemm noTrans computation with 1x4 kernel : C = A*B,
 *
