Skip to content

Commit

Permalink
Merge branch 'develop' into betterPowerGEMVTail
Browse files Browse the repository at this point in the history
  • Loading branch information
ChipKerchner committed Aug 15, 2024
2 parents 083faf7 + eba8615 commit a0aeba6
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 29 deletions.
13 changes: 7 additions & 6 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,20 +89,21 @@ task:
type: text/plain

macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
task:
name: AppleM1/LLVM armv7-androidndk xbuild
compile_script:
- brew install android-ndk
- brew install --cask android-ndk
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- ls /System/Volumes/Data/opt/homebrew
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
- ls /opt/homebrew
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always:
config_artifacts:
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ Examples:
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
```

When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, the option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build.

### Debug version

A debug version can be built using `make DEBUG=1`.
Expand Down
28 changes: 28 additions & 0 deletions cpuid_x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,19 @@ int get_cpuname(void){
break;
case 10: //family 6 exmodel 10
switch (model) {
case 13: // Granite Rapids
if(support_amx_bf16())
return CPUTYPE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 5: // Comet Lake H and S
case 6: // Comet Lake U
case 10: // Meteor Lake
Expand Down Expand Up @@ -2352,8 +2365,22 @@ int get_coretype(void){

case 10:
switch (model) {
case 13: // Granite Rapids
if(support_amx_bf16())
return CORE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CORE_COOPERLAKE;
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
case 5: // Comet Lake H and S
case 6: // Comet Lake U
case 10: // Meteor Lake
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
Expand All @@ -2362,6 +2389,7 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 0: // Meteor Lake
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())
Expand Down
2 changes: 2 additions & 0 deletions driver/others/blas_server.c
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l
main_status[cpu] = MAIN_RUNNING1;
#endif

if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2);

//For target LOONGSON3R5, applying an offset to the buffer is essential
//for minimizing cache conflicts and optimizing performance.
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
Expand Down
10 changes: 6 additions & 4 deletions exports/gensymbol
Original file line number Diff line number Diff line change
Expand Up @@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c
# clatrs3

lapackobjs2d="$lapackobjs2d
dgelqs
dgelst
dgeqp3rk
dgeqrs
dlaqp2rk
dlaqp3rk
dlarmm
Expand All @@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d
# dlaqz4

lapackobjs2z="$lapackobjs2z
zgelqs
zgelst
zgeqp3rk
zgeqrs
zlaqp2rk
zlaqp3rk
zlatrs3
Expand All @@ -918,6 +914,7 @@ lapack_extendedprecision_objs="
"

lapack_deprecated_objsc="
cgelqs cgeqrs
cgegs cggsvd
cgegv cggsvp
cgelsx clahrd
Expand All @@ -926,13 +923,16 @@ lapack_deprecated_objsc="
"

lapack_deprecated_objsd="
dgelqs dgeqrs
dgegs dgeqpf
dgegv dggsvd
dgelsx dggsvp
dlahrd
dlatzm dtzrqf"

lapack_deprecated_objss="
sgelqs
sgeqrs
sgelsx
sgegs
sgegv
Expand All @@ -945,6 +945,8 @@ lapack_deprecated_objss="
"

lapack_deprecated_objsz="
zgelqs
zgeqrs
zgegs
zgegv
zgelsx
Expand Down
36 changes: 18 additions & 18 deletions kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,11 @@
sd $21, 40($sp)
sd $22, 48($sp)

ST $f24, 56($sp)
ST $f25, 64($sp)
ST $f26, 72($sp)
ST $f27, 80($sp)
ST $f28, 88($sp)
sdc1 $f24, 56($sp)
sdc1 $f25, 64($sp)
sdc1 $f26, 72($sp)
sdc1 $f27, 80($sp)
sdc1 $f28, 88($sp)

#if defined(TRMMKERNEL)
sd $23, 96($sp)
Expand All @@ -146,10 +146,10 @@
#endif

#ifndef __64BIT__
ST $f20,120($sp)
ST $f21,128($sp)
ST $f22,136($sp)
ST $f23,144($sp)
sdc1 $f20,120($sp)
sdc1 $f21,128($sp)
sdc1 $f22,136($sp)
sdc1 $f23,144($sp)
#endif

.align 4
Expand Down Expand Up @@ -4000,11 +4000,11 @@
ld $21, 40($sp)
ld $22, 48($sp)

LD $f24, 56($sp)
LD $f25, 64($sp)
LD $f26, 72($sp)
LD $f27, 80($sp)
LD $f28, 88($sp)
ldc1 $f24, 56($sp)
ldc1 $f25, 64($sp)
ldc1 $f26, 72($sp)
ldc1 $f27, 80($sp)
ldc1 $f28, 88($sp)

#if defined(TRMMKERNEL)
ld $23, 96($sp)
Expand All @@ -4013,10 +4013,10 @@
#endif

#ifndef __64BIT__
LD $f20,120($sp)
LD $f21,128($sp)
LD $f22,136($sp)
LD $f23,144($sp)
ldc1 $f20,120($sp)
ldc1 $f21,128($sp)
ldc1 $f22,136($sp)
ldc1 $f23,144($sp)
#endif

daddiu $sp,$sp,STACKSIZE
Expand Down
76 changes: 75 additions & 1 deletion test/compare_sgemm_sbgemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "../common.h"
#define SGEMM BLASFUNC(sgemm)
#define SBGEMM BLASFUNC(sbgemm)
#define SGEMV BLASFUNC(sgemv)
#define SBGEMV BLASFUNC(sbgemv)
typedef union
{
unsigned short v;
Expand Down Expand Up @@ -187,7 +189,79 @@ main (int argc, char *argv[])
free(CC);
}

if (ret != 0)
if (ret != 0) {
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
return ret;
}

k = 1;
for (x = 1; x <= loop; x++)
{
float *A = (float *)malloc(x * x * sizeof(FLOAT));
float *B = (float *)malloc(x * sizeof(FLOAT));
float *C = (float *)malloc(x * sizeof(FLOAT));
bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits));
bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits));
float *DD = (float *)malloc(x * sizeof(FLOAT));
float *CC = (float *)malloc(x * sizeof(FLOAT));
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) ||
(DD == NULL) || (CC == NULL))
return 1;
bfloat16 atmp, btmp;
blasint one = 1;

for (j = 0; j < x; j++)
{
for (i = 0; i < x; i++)
{
A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one);
AA[j * x + i].v = atmp;
}
B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
sbstobf16_(&one, &B[j], &one, &btmp, &one);
BB[j].v = btmp;
}
for (y = 0; y < 2; y++)
{
if (y == 0) {
transA = 'N';
} else {
transA = 'T';
}

memset(CC, 0, x * sizeof(FLOAT));
memset(DD, 0, x * sizeof(FLOAT));
memset(C, 0, x * sizeof(FLOAT));

SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k);
SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k);

for (j = 0; j < x; j++)
for (i = 0; i < x; i++)
if (transA == 'N') {
DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]);
} else if (transA == 'T') {
DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]);
}

for (j = 0; j < x; j++) {
if (fabs (CC[j] - C[j]) > 1.0)
ret++;
if (fabs (CC[j] - DD[j]) > 1.0)
ret++;
}
}
free(A);
free(B);
free(C);
free(AA);
free(BB);
free(DD);
free(CC);
}

if (ret != 0)
fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret);
return ret;
}

0 comments on commit a0aeba6

Please sign in to comment.