Skip to content

Commit

Permalink
[ hgemm/trivial ] Use aligned memory allocation in K1 transpose non_M…
Browse files Browse the repository at this point in the history
…8_case

- Because the K1 GEMM path does not use data packing, it previously did not use aligned memory allocation.
- However, aligned allocation is preferable for SIMD code, so the transpose buffers now use it.

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
  • Loading branch information
skykongkong8 committed Jun 28, 2024
1 parent a2d0536 commit 9651619
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions nntrainer/tensor/hgemm/hgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void hgemm_K1_transA(unsigned int M, unsigned int N, unsigned int K,
const __fp16 *A, unsigned int lda, const __fp16 *B,
unsigned int ldb, __fp16 *C, unsigned int ldc, float alpha,
float beta) {
__fp16 *A_T = new __fp16[M * K];
__fp16 *A_T = alignedMalloc(M * K);

transpose_neon<__fp16>(K, M, A, M, A_T, K);

Expand All @@ -119,7 +119,7 @@ void hgemm_K1_transB(unsigned int M, unsigned int N, unsigned int K,
const __fp16 *A, unsigned int lda, const __fp16 *B,
unsigned int ldb, __fp16 *C, unsigned int ldc, float alpha,
float beta) {
__fp16 *B_T = new __fp16[K * N];
__fp16 *B_T = alignedMalloc(K * N);

transpose_neon<__fp16>(N, K, B, K, B_T, N);

Expand All @@ -132,8 +132,8 @@ void hgemm_K1_transAB(unsigned int M, unsigned int N, unsigned int K,
const __fp16 *A, unsigned int lda, const __fp16 *B,
unsigned int ldb, __fp16 *C, unsigned int ldc,
float alpha, float beta) {
__fp16 *A_T = new __fp16[M * K];
__fp16 *B_T = new __fp16[K * N];
__fp16 *A_T = alignedMalloc(M * K);
__fp16 *B_T = alignedMalloc(K * N);

transpose_neon<__fp16>(K, M, A, M, A_T, K);
transpose_neon<__fp16>(N, K, B, K, B_T, N);
Expand Down

0 comments on commit 9651619

Please sign in to comment.