From 48698b2b1d575cd4e10b5667e9dc5bd2fca1cbf2 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 18 Sep 2024 17:20:43 +0800 Subject: [PATCH] LoongArch64: Rename core Use microarchitecture name instead of meaningless strings to name the core, the legacy core is still retained. 1. Rename LOONGSONGENERIC to LA64_GENERIC 2. Rename LOONGSON3R5 to LA464 3. Rename LOONGSON2K1000 to LA264 --- .github/workflows/loongarch64.yml | 9 + .github/workflows/loongarch64_clang.yml | 6 + Makefile.system | 2 +- TargetList.txt | 10 +- cpuid_loongarch64.c | 406 +++++++++++++++--- driver/others/blas_server.c | 2 +- driver/others/dynamic_loongarch64.c | 104 ++++- driver/others/parameter.c | 2 +- getarch.c | 79 +++- interface/gemm.c | 2 +- .../{KERNEL.LOONGSON2K1000 => KERNEL.LA264} | 0 .../{KERNEL.LOONGSON3R5 => KERNEL.LA464} | 0 kernel/setparam-ref.c | 2 +- param.h | 6 +- 14 files changed, 506 insertions(+), 124 deletions(-) rename kernel/loongarch64/{KERNEL.LOONGSON2K1000 => KERNEL.LA264} (100%) rename kernel/loongarch64/{KERNEL.LOONGSON3R5 => KERNEL.LA464} (100%) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index da7f6c9a0c..69379e0500 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -23,6 +23,15 @@ jobs: - target: LOONGSON2K1000 triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 + - target: LA64_GENERIC + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC + - target: LA464 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 + - target: LA264 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 - target: DYNAMIC_ARCH triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml index d08e56f627..f1a75ad343 100644 --- 
a/.github/workflows/loongarch64_clang.yml +++ b/.github/workflows/loongarch64_clang.yml @@ -20,6 +20,12 @@ jobs: opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - target: LOONGSON2K1000 opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 + - target: LA64_GENERIC + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC + - target: LA464 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 + - target: LA264 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 - target: DYNAMIC_ARCH opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC diff --git a/Makefile.system b/Makefile.system index c40c1f2340..908e65dab0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -727,7 +727,7 @@ endif endif ifeq ($(ARCH), loongarch64) -DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC +DYNAMIC_CORE = LA64_GENERIC LA264 LA464 endif ifeq ($(ARCH), riscv64) diff --git a/TargetList.txt b/TargetList.txt index 1531fd0d2f..25eeddfb00 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -126,9 +126,17 @@ x280 RISCV64_ZVL256B 11.LOONGARCH64: +// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names, +// and it is recommended to use the more standardized naming conventions +// LA64_GENERIC/LA264/LA464. You can still specify TARGET as +// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 during compilation or runtime, +// and they will be internally relocated to LA64_GENERIC/LA264/LA464. LOONGSONGENERIC -LOONGSON3R5 LOONGSON2K1000 +LOONGSON3R5 +LA64_GENERIC +LA264 +LA464 12. Elbrus E2000: E2K diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index 3b7a9c82ea..c6ce2bb731 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2020, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -32,53 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include -#include #include +#include +#include +#include -/* If LASX extension instructions supported, - * using core LOONGSON3R5 - * If only LSX extension instructions supported, - * using core LOONGSON2K1000 - * If neither LASX nor LSX extension instructions supported, - * using core LOONGSONGENERIC (As far as I know, there is no such - * CPU yet) - */ +#define CPU_LA64_GENERIC 0 +#define CPU_LA264 1 +#define CPU_LA364 2 +#define CPU_LA464 3 +#define CPU_LA664 4 -#define CPU_GENERIC 0 -#define CPU_LOONGSON3R5 1 -#define CPU_LOONGSON2K1000 2 +#define CORE_LA64_GENERIC 0 +#define CORE_LA264 1 +#define CORE_LA464 2 #define LA_HWCAP_LSX (1U << 4) #define LA_HWCAP_LASX (1U << 5) +#define LOONGARCH_CFG0 0x00 +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_CFG10 0x10 +#define LOONGARCH_CFG11 0x11 +#define LOONGARCH_CFG12 0x12 +#define LOONGARCH_CFG13 0x13 +#define LOONGARCH_CFG14 0x14 +#define LASX_MASK 1<<7 +#define LSX_MASK 1<<6 +#define PRID_SERIES_MASK 0xf000 +#define PRID_SERIES_LA264 0xa000 +#define PRID_SERIES_LA364 0xb000 +#define PRID_SERIES_LA464 0xc000 +#define PRID_SERIES_LA664 0xd000 + +#define CACHE_INFO_L1_IU 0 +#define CACHE_INFO_L1_D 1 +#define CACHE_INFO_L2_IU 2 +#define CACHE_INFO_L2_D 3 +#define CACHE_INFO_L3_IU 4 +#define CACHE_INFO_L3_D 5 +#define L1_IU_PRESENT_MASK 0x0001 +#define L1_IU_UNITY_MASK 0x0002 +#define L1_D_PRESENT_MASK 0x0004 +#define L2_IU_PRESENT_MASK 0x0008 +#define L2_IU_UNITY_MASK 0x0010 +#define L2_D_PRESENT_MASK 0x0080 +#define L3_IU_PRESENT_MASK 0x0400 +#define L3_IU_UNITY_MASK 0x0800 +#define L3_D_PRESENT_MASK 0x4000 +#define CACHE_WAY_MINUS_1_MASK 0x0000ffff +#define CACHE_INDEX_LOG2_MASK 0x00ff0000 +#define CACHE_LINESIZE_LOG2_MASK 0x7f000000 + +typedef struct { + int size; + int associative; + int linesize; + int unify; + int present; +} cache_info_t; + +/* Using microarchitecture representation */ static char *cpuname[] = { - 
"LOONGSONGENERIC", - "LOONGSON3R5", - "LOONGSON2K1000" + "LA64_GENERIC", + "LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */ + "LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */ + "LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */ + "LA664" /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */ }; static char *cpuname_lower[] = { - "loongsongeneric", - "loongson3r5", - "loongson2k1000" + "la64_generic", + "la264", + "la364", + "la464", + "la664" +}; + +static char *corename[] = { + "LA64_GENERIC", /* Implies using scalar instructions for optimization */ + "LA264", /* Implies using LSX instructions for optimization */ + "LA464", /* Implies using LASX instructions for optimization */ +}; + +static char *corename_lower[] = { + "la64_generic", + "la264", + "la464", }; -int detect(void) { -#ifdef __linux +/* + * Obtain cache and processor identification + * through the cpucfg command. + */ +static void get_cacheinfo(int type, cache_info_t *cacheinfo) { + cache_info_t cache_info; + memset(&cache_info, 0, sizeof(cache_info)); + uint32_t reg_10 = 0; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_10) + : "r"(LOONGARCH_CFG10) + ); + + switch (type) { + case CACHE_INFO_L1_IU: + if (reg_10 & L1_IU_PRESENT_MASK) { + uint32_t reg_11 = 0; + cache_info.present = reg_10 & L1_IU_PRESENT_MASK; + cache_info.unify = reg_10 & L1_IU_UNITY_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_11) + : "r"(LOONGARCH_CFG11) + ); + cache_info.associative = (reg_11 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_11 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_11 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L1_D: + if (reg_10 & L1_D_PRESENT_MASK) { + uint32_t reg_12 = 0; + cache_info.present = reg_10 & L1_D_PRESENT_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_12) + : "r"(LOONGARCH_CFG12) + ); + 
cache_info.associative = (reg_12 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_12 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_12 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L2_IU: + if (reg_10 & L2_IU_PRESENT_MASK) { + uint32_t reg_13 = 0; + cache_info.present = reg_10 & L2_IU_PRESENT_MASK; + cache_info.unify = reg_10 & L2_IU_UNITY_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_13) + : "r"(LOONGARCH_CFG13) + ); + cache_info.associative = (reg_13 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_13 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_13 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L2_D: + if (reg_10 & L2_D_PRESENT_MASK) { + cache_info.present = reg_10 & L2_D_PRESENT_MASK; + // No data fetch + } + break; + + case CACHE_INFO_L3_IU: + if (reg_10 & L3_IU_PRESENT_MASK) { + uint32_t reg_14 = 0; + cache_info.present = reg_10 & L3_IU_PRESENT_MASK; + cache_info.unify = reg_10 & L3_IU_UNITY_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_14) + : "r"(LOONGARCH_CFG14) + ); + cache_info.associative = (reg_14 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_14 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_14 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L3_D: + if (reg_10 & L3_D_PRESENT_MASK) { + cache_info.present = reg_10 & L3_D_PRESENT_MASK; + // No data fetch + } + break; + + default: + break; + } + *cacheinfo = cache_info; +} + +static uint32_t get_prid() { + uint32_t reg = 0; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG0) + ); + return reg; +} + +static void get_cpucount(uint32_t *count) { + uint32_t num = 0; + FILE *f = fopen("/proc/cpuinfo", "r"); + if (!f) return; +
char buf[200]; + while (fgets(buf, sizeof(buf), f)) + { + if (!strncmp("processor", buf, 9)) + num ++; + } + fclose(f); + *count = num; +} + +/* Detect whether the OS supports the LASX instruction set */ +static int os_support_lasx() { int hwcap = (int)getauxval(AT_HWCAP); if (hwcap & LA_HWCAP_LASX) - return CPU_LOONGSON3R5; - else if (hwcap & LA_HWCAP_LSX) - return CPU_LOONGSON2K1000; + return 1; + else + return 0; +} + +/* Detect whether the OS supports the LSX instruction set */ +static int os_support_lsx() { + int hwcap = (int)getauxval(AT_HWCAP); + + if (hwcap & LA_HWCAP_LSX) + return 1; else - return CPU_GENERIC; -#endif - return CPU_GENERIC; + return 0; +} + +int get_coretype(void) { + uint32_t prid = get_prid(); + switch (prid & PRID_SERIES_MASK) { + case (PRID_SERIES_LA464): + case (PRID_SERIES_LA664): + if (os_support_lasx()) + return CORE_LA464; + else if (os_support_lsx()) + return CORE_LA264; + else + return CORE_LA64_GENERIC; + break; + + case (PRID_SERIES_LA264): + case (PRID_SERIES_LA364): + if (os_support_lsx()) + return CORE_LA264; + else + return CORE_LA64_GENERIC; + break; + + default: + return CORE_LA64_GENERIC; + break; + } +} + +int get_cputype(void) { + uint32_t prid = get_prid(); + switch (prid & PRID_SERIES_MASK) { + case (PRID_SERIES_LA264): + return CPU_LA264; + break; + + case (PRID_SERIES_LA364): + return CPU_LA364; + break; + + case (PRID_SERIES_LA464): + return CPU_LA464; + break; + + case (PRID_SERIES_LA664): + return CPU_LA664; + break; + + default: + return CPU_LA64_GENERIC; + break; + } } char *get_corename(void) { - return cpuname[detect()]; + return corename[get_coretype()]; +} + +void get_libname(void){ + printf("%s", corename_lower[get_coretype()]); } void get_architecture(void) { @@ -86,8 +332,7 @@ void get_architecture(void) { } void get_subarchitecture(void) { - int d = detect(); - printf("%s", cpuname[d]); + printf("%s", cpuname[get_cputype()]); } void get_subdirname(void) { @@ -95,50 +340,69 @@ void get_subdirname(void) 
{ } void get_cpuconfig(void) { - uint32_t hwcaps = 0; - int d = detect(); - - switch (d) { - case CPU_LOONGSON3R5: - printf("#define LOONGSON3R5\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 1048576\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + cache_info_t info; + uint32_t num_cores = 0; - case CPU_LOONGSON2K1000: - printf("#define LOONGSON2K1000\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + printf("#define %s\n", corename[get_coretype()]); // Core name - default: - printf("#define LOONGSONGENERIC\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name + + get_cacheinfo(CACHE_INFO_L1_IU, &info); + if (info.present) { + if (info.unify) { // Unified cache, without distinguishing between instructions and data + printf("#define L1_SIZE %d\n", info.size); + printf("#define L1_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_LINESIZE %d\n", info.linesize); + } else { + printf("#define L1_CODE_SIZE %d\n", info.size); + printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_CODE_LINESIZE %d\n", info.linesize); + } } - hwcaps = (uint32_t)getauxval( AT_HWCAP ); - if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); - if (hwcaps & LA_HWCAP_LASX) 
printf("#define HAVE_LASX\n"); -} + if (!info.unify) { + get_cacheinfo(CACHE_INFO_L1_D, &info); + if (info.present) { + printf("#define L1_DATA_SIZE %d\n", info.size); + printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_DATA_LINESIZE %d\n", info.linesize); + } + } -void get_libname(void){ - int d = detect(); - printf("%s", cpuname_lower[d]); + get_cacheinfo(CACHE_INFO_L2_IU, &info); + if (info.present > 0) { + if (info.unify) { + printf("#define L2_SIZE %d\n", info.size); + printf("#define L2_ASSOCIATIVE %d\n", info.associative); + printf("#define L2_LINESIZE %d\n", info.linesize); + } else { + printf("#define L2_CODE_SIZE %d\n", info.size); + printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L2_CODE_LINESIZE %d\n", info.linesize); + } + } + + get_cacheinfo(CACHE_INFO_L3_IU, &info); + if (info.present > 0) { + if (info.unify) { + printf("#define L3_SIZE %d\n", info.size); + printf("#define L3_ASSOCIATIVE %d\n", info.associative); + printf("#define L3_LINESIZE %d\n", info.linesize); + } else { + printf("#define L3_CODE_SIZE %d\n", info.size); + printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L3_CODE_LINESIZE %d\n", info.linesize); + } + } + + if(os_support_lsx()) printf("#define HAVE_LSX\n"); /* the probe must be called: a bare function name is always true */ + if(os_support_lasx()) printf("#define HAVE_LASX\n"); /* same here: emit only when the OS reports LASX */ + + get_cpucount(&num_cores); + if (num_cores) + printf("#define NUM_CORES %u\n", num_cores); + + //TODO: It’s unclear what this entry represents, but it is indeed necessary. + //It has been set based on reference to other platforms.
+ printf("#define DTB_DEFAULT_ENTRIES 64\n"); } diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 29f8a5e646..7306a3ecd8 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1082,7 +1082,7 @@ if (buffer == NULL) { } -//For target LOONGSON3R5, applying an offset to the buffer is essential +//For LOONGARCH64, applying an offset to the buffer is essential //for minimizing cache conflicts and optimizing performance. #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c index 44de596698..51196c6b87 100644 --- a/driver/others/dynamic_loongarch64.c +++ b/driver/others/dynamic_loongarch64.c @@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -extern gotoblas_t gotoblas_LOONGSON3R5; -extern gotoblas_t gotoblas_LOONGSON2K1000; -extern gotoblas_t gotoblas_LOONGSONGENERIC; +#define NUM_CORETYPES 6 +#define LOONGARCH_CFG0 0x00 +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) +#define PRID_SERIES_MASK 0xf000 +#define PRID_SERIES_LA264 0xa000 +#define PRID_SERIES_LA364 0xb000 +#define PRID_SERIES_LA464 0xc000 +#define PRID_SERIES_LA664 0xd000 + +extern gotoblas_t gotoblas_LA64_GENERIC; +extern gotoblas_t gotoblas_LA264; +extern gotoblas_t gotoblas_LA464; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 3 - static char *corename[] = { - "loongson3r5", - "loongson2k1000", + "la64_generic", + "la264", + "la464", "loongsongeneric", + "loongson2k1000", + "loongson3r5", "unknown" }; char *gotoblas_corename(void) { - if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; - if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; - if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; + if (gotoblas == &gotoblas_LA64_GENERIC) 
return corename[0]; + if (gotoblas == &gotoblas_LA264) return corename[1]; + if (gotoblas == &gotoblas_LA464) return corename[2]; return corename[NUM_CORETYPES]; } @@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) { switch (found) { - case 0: return (&gotoblas_LOONGSON3R5); - case 1: return (&gotoblas_LOONGSON2K1000); - case 2: return (&gotoblas_LOONGSONGENERIC); + case 0: return (&gotoblas_LA64_GENERIC); + case 1: return (&gotoblas_LA264); + case 2: return (&gotoblas_LA464); + case 3: return (&gotoblas_LA64_GENERIC); + case 4: return (&gotoblas_LA264); + case 5: return (&gotoblas_LA464); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); return NULL; } -#define LA_HWCAP_LSX (1U << 4) -#define LA_HWCAP_LASX (1U << 5) -static gotoblas_t *get_coretype(void) { - int hwcap = (int)getauxval(AT_HWCAP); +/* Detect whether the OS supports the LASX instruction set */ +static int os_support_lasx() { + int hwcap = (int)getauxval(AT_HWCAP); if (hwcap & LA_HWCAP_LASX) - return &gotoblas_LOONGSON3R5; - else if (hwcap & LA_HWCAP_LSX) - return &gotoblas_LOONGSON2K1000; + return 1; + else + return 0; +} + +/* Detect whether the OS supports the LSX instruction set */ +static int os_support_lsx() { + int hwcap = (int)getauxval(AT_HWCAP); + + if (hwcap & LA_HWCAP_LSX) + return 1; else - return &gotoblas_LOONGSONGENERIC; + return 0; +} + +static uint32_t get_prid() { + uint32_t reg = 0; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG0) + ); + return reg; +} + +/* Select core at runtime based on the + * cpu name and SIMD instructions supported + * by the system + */ +static gotoblas_t *get_coretype(void) { + uint32_t prid = get_prid(); + switch (prid & PRID_SERIES_MASK) { + case (PRID_SERIES_LA464): + case (PRID_SERIES_LA664): + if (os_support_lasx()) + return &gotoblas_LA464; + else if (os_support_lsx()) + return &gotoblas_LA264; + else + return &gotoblas_LA64_GENERIC; + break; + + case 
(PRID_SERIES_LA264): + case (PRID_SERIES_LA364): + if (os_support_lsx()) + return &gotoblas_LA264; + else + return &gotoblas_LA64_GENERIC; + break; + + default: + return &gotoblas_LA64_GENERIC; + break; + } } void gotoblas_dynamic_init(void) { diff --git a/driver/others/parameter.c b/driver/others/parameter.c index a208a1a9d7..597e5cac7e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -752,7 +752,7 @@ int get_L3_size() { } void blas_set_parameter(void){ -#if defined(LOONGSON3R5) +#if defined(LA464) int L3_size = get_L3_size(); #ifdef SMP if(blas_num_threads == 1){ diff --git a/getarch.c b/getarch.c index 842a843fad..826dd1ce0a 100644 --- a/getarch.c +++ b/getarch.c @@ -135,11 +135,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_CELL */ /* #define FORCE_MIPS64_GENERIC */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3R3 */ -/* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_LOONGSON3R5 */ /* #define FORCE_LOONGSON2K1000 */ /* #define FORCE_LOONGSONGENERIC */ +/* #define FORCE_LA64_GENERIC */ +/* #define FORCE_LA264 */ +/* #define FORCE_LA464 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -153,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_EV5 */ /* #define FORCE_EV6 */ /* #define FORCE_CSKY */ -/* #define FORCE_CK860FV */ +/* #define FORCE_CK860FV */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 @@ -979,46 +982,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif -#ifdef FORCE_LOONGSON3R5 +#if defined(FORCE_LA464) || defined(FORCE_LOONGSON3R5) #define FORCE #define ARCHITECTURE "LOONGARCH" -#define SUBARCHITECTURE "LOONGSON3R5" +#ifdef NO_LASX +#ifdef NO_LSX +#define SUBARCHITECTURE "LA64_GENERIC" #define SUBDIRNAME "loongarch64" -#define ARCHCONFIG "-DLOONGSON3R5 " \ +#define ARCHCONFIG "-DLA64_GENERIC " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ - "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" -#define LIBNAME "loongson3r5" -#define CORENAME "LOONGSON3R5" + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la64_generic" +#define CORENAME "LA64_GENERIC" #else +#define SUBARCHITECTURE "LA264" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLA264 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la264" +#define CORENAME "LA264" +#endif +#else +#define SUBARCHITECTURE "LA464" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLA464 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la464" +#define CORENAME "LA464" +#endif #endif -#ifdef FORCE_LOONGSON2K1000 +#if defined(FORCE_LA264) || defined(FORCE_LOONGSON2K1000) #define FORCE #define ARCHITECTURE "LOONGARCH" -#define SUBARCHITECTURE "LOONGSON2K1000" +#ifdef NO_LSX +#define SUBARCHITECTURE "LA64_GENERIC" #define SUBDIRNAME "loongarch64" -#define ARCHCONFIG "-DLOONGSON2K1000 " \ +#define ARCHCONFIG "-DLA64_GENERIC " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" -#define LIBNAME "loongson2k1000" -#define CORENAME "LOONGSON2K1000" + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la64_generic" +#define CORENAME "LA64_GENERIC" #else +#define 
SUBARCHITECTURE "LA264" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLA264 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la264" +#define CORENAME "LA264" +#endif #endif -#ifdef FORCE_LOONGSONGENERIC +#if defined(FORCE_LA64_GENERIC) || defined(FORCE_LOONGSONGENERIC) #define FORCE #define ARCHITECTURE "LOONGARCH" -#define SUBARCHITECTURE "LOONGSONGENERIC" +#define SUBARCHITECTURE "LA64_GENERIC" #define SUBDIRNAME "loongarch64" -#define ARCHCONFIG "-DLOONGSONGENERIC " \ +#define ARCHCONFIG "-DLA64_GENERIC " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" -#define LIBNAME "loongsongeneric" -#define CORENAME "LOONGSONGENERIC" -#else + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la64_generic" +#define CORENAME "LA64_GENERIC" #endif #ifdef FORCE_I6400 diff --git a/interface/gemm.c b/interface/gemm.c index 64b8b620cf..c030947b6f 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -572,7 +572,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS buffer = (XFLOAT *)blas_memory_alloc(0); -//For target LOONGSON3R5, applying an offset to the buffer is essential +//For LOONGARCH64, applying an offset to the buffer is essential //for minimizing cache conflicts and optimizing performance. 
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LA264 similarity index 100% rename from kernel/loongarch64/KERNEL.LOONGSON2K1000 rename to kernel/loongarch64/KERNEL.LA264 diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LA464 similarity index 100% rename from kernel/loongarch64/KERNEL.LOONGSON3R5 rename to kernel/loongarch64/KERNEL.LA464 diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 9d494bfc62..fa61a209e1 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1086,7 +1086,7 @@ static void init_parameter(void) { TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif -#if defined(LOONGSON3R5) +#if defined(LA464) int L3_size = get_L3_size(); #ifdef SMP if(blas_num_threads == 1){ diff --git a/param.h b/param.h index 0e4d8965d9..66eedc7980 100644 --- a/param.h +++ b/param.h @@ -2838,7 +2838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined (LOONGSON3R5) +#if defined (LA464) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2891,7 +2891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON2K1000 +#ifdef LA264 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL @@ -2926,7 +2926,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSONGENERIC +#ifdef LA64_GENERIC #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL