From 448fe1c42a928f2a090f7ec1a5be9b6e81ccbf0f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 24 Aug 2022 19:24:01 +0800 Subject: [PATCH 01/36] Test on PolarFire Soc. `make NOFORTRAN=1 CC=gcc` --- TargetList.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index deef758195..99d603d030 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -118,7 +118,7 @@ Z13 Z14 10.RISC-V 64: -RISCV64_GENERIC +RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) C910V 11.LOONGARCH64: From bef47917bd72f35c151038fee0cf485445476863 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 00:06:25 -0800 Subject: [PATCH 02/36] Initial version for riscv sifive x280 --- Makefile.install | 7 + Makefile.prebuild | 8 + Makefile.riscv64 | 8 + README.md | 5 + TargetList.txt | 1 + benchmark/Makefile | 6 + common_riscv64.h | 4 + cpuid_riscv64.c | 2 + getarch.c | 12 + kernel/riscv64/KERNEL.x280 | 267 ++++++++ kernel/riscv64/amax_rvv.c | 102 +++ kernel/riscv64/amin_rvv.c | 102 +++ kernel/riscv64/asum_rvv.c | 99 +++ kernel/riscv64/axpby_rvv.c | 171 +++++ kernel/riscv64/axpy_rvv.c | 109 +++ kernel/riscv64/copy_rvv.c | 94 +++ kernel/riscv64/dot_rvv.c | 126 ++++ kernel/riscv64/gemm_beta_rvv.c | 89 +++ kernel/riscv64/gemm_ncopy_2_rvv.c | 92 +++ kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ++++ kernel/riscv64/gemm_ncopy_8_rvv.c | 164 +++++ kernel/riscv64/gemm_ncopy_rvv_v1.c | 76 +++ kernel/riscv64/gemm_tcopy_2_rvv.c | 108 +++ kernel/riscv64/gemm_tcopy_4_rvv.c | 236 +++++++ kernel/riscv64/gemm_tcopy_8_rvv.c | 264 ++++++++ kernel/riscv64/gemm_tcopy_rvv_v1.c | 74 +++ kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ++++++ kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ++++++++++++++ kernel/riscv64/gemmkernel_rvv_v1x8.c | 601 +++++++++++++++++ kernel/riscv64/gemv_n_rvv.c | 94 +++ kernel/riscv64/gemv_t_rvv.c | 119 ++++ kernel/riscv64/iamax_rvv.c | 150 +++++ kernel/riscv64/iamin_rvv.c | 151 +++++ kernel/riscv64/imax_rvv.c | 147 +++++ kernel/riscv64/imin_rvv.c | 147 +++++ kernel/riscv64/izamax_rvv.c | 162 +++++ kernel/riscv64/izamin_rvv.c | 161 +++++ kernel/riscv64/max_rvv.c | 98 +++ kernel/riscv64/min_rvv.c | 98 +++ kernel/riscv64/nrm2_rvv.c | 117 ++++ kernel/riscv64/rot_rvv.c | 149 +++++ kernel/riscv64/scal_rvv.c | 80 +++ kernel/riscv64/sum_rvv.c | 95 +++ kernel/riscv64/swap_rvv.c | 142 ++++ kernel/riscv64/symm_lcopy_rvv_v1.c | 101 +++ kernel/riscv64/symm_ucopy_rvv_v1.c | 100 +++ kernel/riscv64/symv_L_rvv.c | 224 +++++++ kernel/riscv64/symv_U_rvv.c | 221 +++++++ kernel/riscv64/trmm_lncopy_rvv_v1.c | 138 ++++ kernel/riscv64/trmm_ltcopy_rvv_v1.c | 134 ++++ kernel/riscv64/trmm_uncopy_rvv_v1.c | 136 ++++ kernel/riscv64/trmm_utcopy_rvv_v1.c | 133 ++++ kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ++++++++++ kernel/riscv64/trmmkernel_4x4_rvv.c | 881 +++++++++++++++++++++++++ kernel/riscv64/trmmkernel_rvv_v1x8.c | 685 +++++++++++++++++++ kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 847 ++++++++++++++++++++++++ kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 840 +++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 792 ++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 828 +++++++++++++++++++++++ kernel/riscv64/trsm_lncopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_ltcopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_uncopy_rvv_v1.c | 121 ++++ kernel/riscv64/trsm_utcopy_rvv_v1.c | 123 ++++ kernel/riscv64/zamax_rvv.c | 113 ++++ kernel/riscv64/zamin_rvv.c | 112 ++++ kernel/riscv64/zasum_rvv.c | 108 +++ kernel/riscv64/zaxpby_rvv.c | 151 +++++ kernel/riscv64/zaxpy_rvv.c | 154 +++++ 
kernel/riscv64/zcopy_rvv.c | 105 +++ kernel/riscv64/zdot_rvv.c | 170 +++++ kernel/riscv64/zgemm_beta_rvv.c | 117 ++++ kernel/riscv64/zgemv_n_rvv.c | 170 +++++ kernel/riscv64/zgemv_t_rvv.c | 172 +++++ kernel/riscv64/znrm2_rvv.c | 122 ++++ kernel/riscv64/zrot_rvv.c | 181 +++++ kernel/riscv64/zscal_rvv.c | 148 +++++ kernel/riscv64/zsum_rvv.c | 97 +++ kernel/riscv64/zswap_rvv.c | 156 +++++ kernel/riscv64/ztrmmkernel_2x2_rvv.c | 596 +++++++++++++++++ param.h | 44 ++ 80 files changed, 15188 insertions(+) create mode 100644 kernel/riscv64/KERNEL.x280 create mode 100644 kernel/riscv64/amax_rvv.c create mode 100644 kernel/riscv64/amin_rvv.c create mode 100644 kernel/riscv64/asum_rvv.c create mode 100644 kernel/riscv64/axpby_rvv.c create mode 100644 kernel/riscv64/axpy_rvv.c create mode 100644 kernel/riscv64/copy_rvv.c create mode 100644 kernel/riscv64/dot_rvv.c create mode 100644 kernel/riscv64/gemm_beta_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_rvv_v1.c create mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_rvv_v1.c create mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/gemmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/gemv_n_rvv.c create mode 100644 kernel/riscv64/gemv_t_rvv.c create mode 100644 kernel/riscv64/iamax_rvv.c create mode 100644 kernel/riscv64/iamin_rvv.c create mode 100644 kernel/riscv64/imax_rvv.c create mode 100644 kernel/riscv64/imin_rvv.c create mode 100644 kernel/riscv64/izamax_rvv.c create mode 100644 kernel/riscv64/izamin_rvv.c create mode 100644 kernel/riscv64/max_rvv.c create mode 100644 kernel/riscv64/min_rvv.c create mode 100644 kernel/riscv64/nrm2_rvv.c create mode 100644 kernel/riscv64/rot_rvv.c create mode 100644 kernel/riscv64/scal_rvv.c create mode 100644 kernel/riscv64/sum_rvv.c create mode 100644 kernel/riscv64/swap_rvv.c create mode 100644 kernel/riscv64/symm_lcopy_rvv_v1.c create mode 100644 kernel/riscv64/symm_ucopy_rvv_v1.c create mode 100644 kernel/riscv64/symv_L_rvv.c create mode 100644 kernel/riscv64/symv_U_rvv.c create mode 100644 kernel/riscv64/trmm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/trmmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/trsm_kernel_LN_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_LT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_RN_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_RT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/zamax_rvv.c create mode 100644 kernel/riscv64/zamin_rvv.c create mode 100644 kernel/riscv64/zasum_rvv.c create mode 100644 kernel/riscv64/zaxpby_rvv.c create mode 100644 kernel/riscv64/zaxpy_rvv.c create mode 100644 kernel/riscv64/zcopy_rvv.c create mode 100644 kernel/riscv64/zdot_rvv.c 
create mode 100644 kernel/riscv64/zgemm_beta_rvv.c create mode 100644 kernel/riscv64/zgemv_n_rvv.c create mode 100644 kernel/riscv64/zgemv_t_rvv.c create mode 100644 kernel/riscv64/znrm2_rvv.c create mode 100644 kernel/riscv64/zrot_rvv.c create mode 100644 kernel/riscv64/zscal_rvv.c create mode 100644 kernel/riscv64/zsum_rvv.c create mode 100644 kernel/riscv64/zswap_rvv.c create mode 100644 kernel/riscv64/ztrmmkernel_2x2_rvv.c diff --git a/Makefile.install b/Makefile.install index 87b5bc8701..f1adaa2719 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,6 +8,7 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin +OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -38,6 +39,7 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -202,3 +204,8 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! +#Generating release tar + @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz + @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . + + diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f12741..e6a8eab597 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), x280) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +endif + +ifeq ($(TARGET), RISCV64_GENERIC) +TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce91e03ecd..d6eaf552d6 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -2,3 +2,11 @@ ifeq ($(CORE), C910V) CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif +ifeq ($(CORE), x280) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +endif +ifeq ($(CORE), RISCV64_GENERIC) +CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static +endif \ No newline at end of file diff --git a/README.md b/README.md index 6ce85e08eb..6ecb461786 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ``` (also known to work on C906) +- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. 
+ ```sh + make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. diff --git a/TargetList.txt b/TargetList.txt index deef758195..6c533361e1 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -120,6 +120,7 @@ Z14 10.RISC-V 64: RISCV64_GENERIC C910V +x280 11.LOONGARCH64: LOONGSONGENERIC diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a4..734c83a264 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +# x280 temporary workaround for gfortran +ifeq ($(TARGET), x280) +CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) +endif + + ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ diff --git a/common_riscv64.h b/common_riscv64.h index 7ddbe80a46..221a799016 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,6 +92,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) +#include +#endif + +#if defined(x280) #include #endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873d..5326787e6b 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 +#define CPU_x280 2 static char *cpuname[] = { "RISCV64_GENERIC", "C910V" + "x280" }; int detect(void){ diff --git a/getarch.c b/getarch.c index cde5b4e83f..0d197285ce 100644 --- a/getarch.c +++ b/getarch.c @@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "c910v" #define CORENAME "C910V" #endif +#endif +#ifdef FORCE_x280 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "x280" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-Dx280 " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "x280" +#define CORENAME "x280" #else #endif diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 new file mode 100644 index 0000000000..2eb60f2b46 --- /dev/null +++ b/kernel/riscv64/KERNEL.x280 @@ -0,0 +1,267 @@ +# ********************************************************************************** +# Copyright (c) 2022, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. 
Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +CTRMMKERNEL = ztrmmkernel_2x2_rvv.c +ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c + +# SGEMM_UNROLL_N set in params.h +ifeq ($(SGEMM_UNROLL_N), 2) +SGEMMKERNEL = gemmkernel_2x2_rvv.c +SGEMMONCOPY = gemm_ncopy_2_rvv.c +SGEMMOTCOPY = gemm_tcopy_2_rvv.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMKERNEL = gemmkernel_4x4_rvv.c +SGEMMONCOPY = gemm_ncopy_4_rvv.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c 
+SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +SGEMMKERNEL = gemmkernel_rvv_v1x8.c +SGEMMINCOPY = gemm_ncopy_rvv_v1.c +SGEMMITCOPY = gemm_tcopy_rvv_v1.c +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = trmmkernel_rvv_v1x8.c + +STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +SSYMMUCOPY_M = symm_ucopy_rvv_v1.c +SSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +# SGEMM_UNROLL_N set in params.h +ifeq ($(DGEMM_UNROLL_N), 2) +DGEMMKERNEL = gemmkernel_2x2_rvv.c +DGEMMONCOPY = gemm_ncopy_2_rvv.c +DGEMMOTCOPY = gemm_tcopy_2_rvv.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMKERNEL = gemmkernel_4x4_rvv.c +DGEMMONCOPY = gemm_ncopy_4_rvv.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +DGEMMKERNEL = gemmkernel_rvv_v1x8.c +DGEMMINCOPY = gemm_ncopy_rvv_v1.c +DGEMMITCOPY = gemm_tcopy_rvv_v1.c +DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = trmmkernel_rvv_v1x8.c +DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +DSYMMUCOPY_M = symm_ucopy_rvv_v1.c +DSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = 
../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c new file mode 100644 index 0000000000..c9c6e7f730 --- /dev/null +++ b/kernel/riscv64/amax_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c new file mode 100644 index 0000000000..370b6c3388 --- /dev/null +++ b/kernel/riscv64/amin_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
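The amax kernel above, and the amin/asum kernels that follow, share one strip-mining pattern: take `vl = vsetvl(n)` elements per pass, keep a full-width vector accumulator, and collapse it with a single cross-lane reduction after the loop. A minimal single-precision, unit-stride sketch of that pattern, assuming `<riscv_vector.h>` and the pre-`__riscv_` intrinsic spelling used throughout this patch (newer toolchains prefix these names); `samax_sketch` is a hypothetical helper for illustration, not part of the patch:

```c
#include <riscv_vector.h>
#include <stddef.h>

/* Strip-mined absolute maximum, mirroring amax_rvv.c (float, unit stride). */
static float samax_sketch(size_t n, float *x)
{
    size_t vlmax = vsetvlmax_e32m8();
    vfloat32m8_t vmax = vfmv_v_f_f32m8(0.0f, vlmax);               /* per-lane running max */
    vfloat32m1_t vres = vfmv_v_f_f32m1(0.0f, vsetvlmax_e32m1());

    for (size_t vl; n > 0; n -= vl, x += vl) {
        vl = vsetvl_e32m8(n);                  /* elements handled this pass */
        vfloat32m8_t vx = vle32_v_f32m8(x, vl);
        vx = vfabs_v_f32m8(vx, vl);            /* |x[i]| */
        vmax = vfmax_vv_f32m8(vmax, vx, vl);   /* lane-wise max */
    }

    /* One cross-lane reduction at the end instead of one per iteration. */
    vres = vfredmax_vs_f32m8_f32m1(vres, vmax, vres, vlmax);
    return vfmv_f_s_f32m1_f32(vres);
}
```

The strided (`inc_x != 1`) path in these kernels differs only in using the strided load `vlse32_v_f32m8` with a byte stride of `inc_x * sizeof(FLOAT)`.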
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c new file mode 100644 index 0000000000..4f711c9be0 --- /dev/null +++ b/kernel/riscv64/asum_rvv.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + asumf = VFMVFS_FLOAT_M1(v_res); + return(asumf); +} diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c new file mode 100644 index 0000000000..7c35c563d1 --- /dev/null +++ b/kernel/riscv64/axpby_rvv.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + FLOAT_V_T vx, vy; + + if ( n < 0 ) return(0); + + if ( beta == 0.0 ) { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + memset(&y[0], 0, n * sizeof(FLOAT)); + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + size_t vl = VSETVL(n); + vy = VFMVVF_FLOAT(0.0, vl); + for ( ; n > 0; n -= vl, y += vl*stride_y) { + vl = VSETVL(n); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + + } else { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= 
vl, y += vl*inc_y) { + vl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + } + + return(0); +} diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c new file mode 100644 index 0000000000..3986f4e212 --- /dev/null +++ b/kernel/riscv64/axpy_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
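The case split in axpby_rvv.c above specializes the scalar definition of AXPBY, `y := alpha*x + beta*y`, by whether alpha and beta are zero and by whether the increments are unit or strided. A plain-C reference of the operation being specialized, assuming positive increments (`axpby_ref` is a hypothetical helper, not part of the patch); note that the byte-valued `stride_x`/`stride_y` are only arguments to the strided loads and stores, while the pointers themselves advance in elements (`vl * inc_x`, `vl * inc_y`):

```c
#include <stddef.h>

/* Scalar reference for AXPBY: y := alpha*x + beta*y (element strides). */
static void axpby_ref(size_t n, float alpha, const float *x, size_t inc_x,
                      float beta, float *y, size_t inc_y)
{
    for (size_t i = 0; i < n; i++) {
        if (beta == 0.0f) {
            /* beta == 0: y is overwritten and never read (the kernel's
               memset / pure-store paths). */
            y[i * inc_y] = (alpha == 0.0f) ? 0.0f : alpha * x[i * inc_x];
        } else if (alpha == 0.0f) {
            y[i * inc_y] = beta * y[i * inc_y];     /* x is never read */
        } else {
            y[i * inc_y] = alpha * x[i * inc_x] + beta * y[i * inc_y];
        }
    }
}
```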
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( n <= 0 ) return(0); + if ( da == 0.0 ) return(0); + + FLOAT_V_T vx, vy; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT(y, vy, vl); + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c new file mode 100644 index 0000000000..5d5a8bd049 --- /dev/null +++ b/kernel/riscv64/copy_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1 && inc_y == 1) { + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if(inc_x == 1) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c new file mode 100644 index 0000000000..60dcac2f57 --- /dev/null +++ b/kernel/riscv64/dot_rvv.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + double dot = 0.0; + + if ( n <= 0 ) return(dot); + + size_t vlmax = vsetvlmax_e64m8(); + vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } + + vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); + dot = vfmv_f_s_f64m1_f64(vec_sum); + + return(dot); +} diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c new file mode 100644 index 0000000000..34d1ea0780 --- /dev/null +++ b/kernel/riscv64/gemm_beta_rvv.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
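dot_rvv.c above keeps its accumulator in double precision for every variant: with single-precision inputs the products are formed through the widening multiply-accumulate `vfwmacc_vv_f64m8`, so the same loop serves SDOT and the extended-precision DSDOT entry point, while double-precision inputs use the non-widening `vfmacc_vv_f64m8`. A scalar sketch of that accumulation, unit stride only, with `sdot_wide_ref` a hypothetical helper:

```c
#include <stddef.h>

/* Scalar reference for the widening accumulation in dot_rvv.c:
   float products are widened to double before being summed. */
static double sdot_wide_ref(size_t n, const float *x, const float *y)
{
    double dot = 0.0;
    for (size_t i = 0; i < n; i++)
        dot += (double)x[i] * (double)y[i];
    return dot;
}
```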
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#endif + +// Optimizes the implementation in ../generic/gemm_beta.c + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vx; + + if (beta == ZERO) { + + vl = VSETVL(m); + vx = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + vx = VLEV_FLOAT(c_offset, vl); + vx = VFMULVF_FLOAT(vx, beta, vl); + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c new file mode 100644 index 0000000000..5f55bc349e --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_2_rvv.c @@ -0,0 +1,92 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
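gemm_beta_rvv.c above vectorizes the `C := beta * C` pre-scaling step of GEMM. Its `beta == 0` path stores a zero vector without ever loading C, so stale NaN or Inf values in C are discarded rather than propagated, matching the conventional BLAS treatment of beta == 0. A scalar sketch of the operation for a column-major m x n matrix (`gemm_beta_ref` is a hypothetical helper):

```c
#include <stddef.h>

/* Scalar reference for gemm_beta_rvv.c: scale column-major C (m x n,
   leading dimension ldc) by beta; beta == 0 writes zeros without reading C. */
static void gemm_beta_ref(size_t m, size_t n, float beta, float *c, size_t ldc)
{
    for (size_t j = 0; j < n; j++, c += ldc)
        for (size_t i = 0; i < m; i++)
            c[i] = (beta == 0.0f) ? 0.0f : beta * c[i];
}
```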
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEG2_FLOAT vsseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEG2_FLOAT vsseg2e64_v_f64m4 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset; + FLOAT_V_T v1, v2; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + + for(j = (n >> 1); j > 0; j--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset, vl); + VSEV_FLOAT(b_offset, v1, vl); + + a_offset += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c new file mode 100644 index 0000000000..4d4efe4c95 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_4_rvv.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG4_FLOAT vsseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG4_FLOAT vsseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 2); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c new file mode 100644 index 0000000000..525b223c20 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); 
+ + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c new file mode 100644 index 0000000000..2c5230752c --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -0,0 +1,76 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
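The gemm_ncopy_rvv_v1.c kernel that follows packs vl columns at a time: a strided load (stride lda) gathers one element from each of the vl columns for the current row, and the gathered row segments are written out back to back. A scalar sketch of the resulting layout is given below; ncopy_v1_ref is an illustrative name and vl stands in for whatever vsetvl returns.

/* Scalar sketch: each group of up to vl columns becomes a row-major
 * (m x width) panel in b, matching the strided-load packing below. */
void ncopy_v1_ref(long m, long n, const float *a, long lda,
                  float *b, long vl)
{
    long i, j, jj;
    for (j = 0; j < n; j += vl) {
        long width = (n - j < vl) ? (n - j) : vl;  /* last panel may be narrower */
        for (i = 0; i < m; i++)                    /* one row of the panel       */
            for (jj = 0; jj < width; jj++)
                *b++ = a[(j + jj) * lda + i];      /* gather with stride lda     */
    }
}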
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda; + + for(i = m; i > 0; i--) { + v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(b_offset, v0, vl); + + a_offset1++; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c new file mode 100644 index 0000000000..963e1be695 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_2_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
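gemm_tcopy_2_rvv.c below builds the transposed packing with vlseg2 (splitting even- and odd-indexed columns of a row) and a strided vssseg4 that scatters complete 2x2 tiles. The layout it produces can be written directly in scalar form; tcopy_2_ref is an illustrative name, and the tile addressing follows from b_offset advancing by 4 per row pair while the segment-store stride is 2*m elements.

/* Scalar sketch of the 2x2 tile layout written below: tile (row pair r,
 * column pair k) starts at b + 4*r + 2*m*k; the odd trailing column goes
 * to the region starting at b + m*(n & ~1). */
void tcopy_2_ref(long m, long n, const float *a, long lda, float *b)
{
    long i, j;
    float *btail = b + m * (n & ~1L);
    for (i = 0; i + 1 < m; i += 2) {
        for (j = 0; j + 1 < n; j += 2) {
            float *t = b + (i / 2) * 4 + (j / 2) * (2 * m);
            t[0] = a[i * lda + j];
            t[1] = a[i * lda + j + 1];
            t[2] = a[(i + 1) * lda + j];
            t[3] = a[(i + 1) * lda + j + 1];
        }
        if (n & 1) {                     /* odd column: two values per row pair */
            *btail++ = a[i * lda + (n - 1)];
            *btail++ = a[(i + 1) * lda + (n - 1)];
        }
    }
    /* a final odd row is handled analogously by the kernel's (m & 1) branch */
}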
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 +#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT_V_T v1a, v1b, v2a, v2b; + size_t vl; + + //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + for(i = (m >> 1); i > 0; i--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); + VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); + + VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); + + a_offset1 += vl * 2; + a_offset2 += vl * 2; + b_offset1 += vl * m * 2; + } + + if (n & 1) { + *(b_offset2 + 0) = *(a_offset1 + 0); + *(b_offset2 + 1) = *(a_offset2 + 0); + b_offset2 += 2; + } + } + + if (m & 1) { + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); + + VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); + + a_offset += vl * 2; + b_offset += vl * m * 2; + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c new file mode 100644 index 0000000000..ac9974b24b --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_4_rvv.c @@ -0,0 +1,236 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 +#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + for(i = (n >> 2); i > 0; i--) { + v1 = VLEV_FLOAT(a_offset1, 4); + v2 = VLEV_FLOAT(a_offset2, 4); + v3 = VLEV_FLOAT(a_offset3, 4); + v4 = VLEV_FLOAT(a_offset4, 4); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + VSEV_FLOAT(b_offset1, v1, 4); + VSEV_FLOAT(b_offset2+4, v2, 4); + VSEV_FLOAT(b_offset2+8, v3, 4); + VSEV_FLOAT(b_offset2+12, v4, 4); + + b_offset1 += m * 4; + } + + if (n & 2) { + v1 = VLEV_FLOAT(a_offset1, 2); + v2 = VLEV_FLOAT(a_offset2, 2); + v3 = VLEV_FLOAT(a_offset3, 2); + v4 = VLEV_FLOAT(a_offset4, 2); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + VSEV_FLOAT(b_offset2, v1, 2); + VSEV_FLOAT(b_offset2+2, v2, 2); + VSEV_FLOAT(b_offset2+4, v3, 2); + VSEV_FLOAT(b_offset2+6, v4, 2); + + b_offset2 += 8; + } + + if (n & 1) { + v1 = VLEV_FLOAT(a_offset1, 1); + v2 = VLEV_FLOAT(a_offset2, 1); + v3 = VLEV_FLOAT(a_offset3, 1); + v4 = VLEV_FLOAT(a_offset4, 1); + + VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); + + b_offset3 += 4; + } + + } + +// TODO cleanup + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + 
*(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c new file mode 100644 index 0000000000..81c1f962bc --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -0,0 +1,264 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
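In gemm_tcopy_8_rvv.c below, the strided segment load vlsseg8 gathers, for each of the vl rows in the current block (vl is fixed at 8, 4 or 2 per branch), eight consecutive columns with stride lda between rows; the unit-stride vsseg8 then emits each row's eight values contiguously. Per tile that is equivalent to the scalar copy below; tcopy_8_tile_ref is an illustrative name, not part of the patch.

/* Scalar sketch of one 8-column tile gather/store from the kernel below:
 * rows of the tile end up contiguous, eight elements per row. */
void tcopy_8_tile_ref(long rows, const float *a, long lda, float *b)
{
    long r, c;
    for (r = 0; r < rows; r++)        /* one segment per row (vl = rows) */
        for (c = 0; c < 8; c++)       /* eight fields per segment        */
            *b++ = a[r * lda + c];
}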
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEV_FLOAT vlse32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEV_FLOAT vlse64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = (m >> 3); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 8; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 32; + } + + if (n & 2) { + size_t vl = 8; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 16; + } + + if (n & 1) { + size_t vl = 8; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 8; + } + + } + + if (m & 4) { + + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 4; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 4; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 2) { + size_t vl = 4; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 8; + } + + if (n & 1) { + size_t vl = 4; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 4; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 2; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 
+= 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 2; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 2) { + size_t vl = 2; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 4; + } + + if (n & 1) { + size_t vl = 2; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 2; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 2) { + size_t vl = 2; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset3, v0, vl); + + aoffset1 += 2; + // boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + // aoffset1 ++; + // boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c new file mode 100644 index 0000000000..a291b70b81 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
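gemm_tcopy_rvv_v1.c below is the unit-stride counterpart of gemm_ncopy_rvv_v1.c: each group of vl consecutive columns is copied row by row with plain vle/vse, so both _rvv_v1 copy kernels emit the same vl-wide panel layout, one row segment after another, presumably consumed by the _rvv_v1 compute kernels later in this patch. Scalar sketch (tcopy_v1_ref is an illustrative name; vl stands for the vsetvl result):

/* Scalar sketch of the panel copy below: vl-wide row segments of A are
 * written out consecutively, one row after another. */
void tcopy_v1_ref(long m, long n, const float *a, long lda,
                  float *b, long vl)
{
    long i, j, jj;
    for (j = 0; j < n; j += vl) {
        long width = (n - j < vl) ? (n - j) : vl;
        for (i = 0; i < m; i++)
            for (jj = 0; jj < width; jj++)
                *b++ = a[i * lda + j + jj];     /* unit-stride row segment */
    }
}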
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl; + + for(i = m; i > 0; i--) { + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset, v0, vl); + + aoffset1 += lda; + boffset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c new file mode 100644 index 0000000000..ec8961ced7 --- /dev/null +++ b/kernel/riscv64/gemmkernel_2x2_rvv.c @@ -0,0 +1,214 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
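The gemmkernel_2x2_rvv.c kernel that follows vectorizes along the K dimension: four partial-product vectors are accumulated with vfmacc.vv, each is collapsed with vfredusum, and the resulting scalars are scaled by alpha and added into C (beta is applied separately by the gemm_beta kernel at the top of this patch). Per 2x2 tile that amounts to the scalar computation below; gemm_2x2_ref is an illustrative name, and pa/pb are the packed panels produced by the 2-wide copy kernels.

/* Scalar sketch of one 2x2 tile of the reduction-style kernel below:
 * four length-bk dot products over the interleaved packed panels. */
void gemm_2x2_ref(long bk, float alpha, const float *pa, const float *pb,
                  float *c0, float *c1)
{
    float r00 = 0, r10 = 0, r01 = 0, r11 = 0;
    long k;
    for (k = 0; k < bk; k++) {
        float a0 = pa[2 * k], a1 = pa[2 * k + 1];   /* two packed A rows    */
        float b0 = pb[2 * k], b1 = pb[2 * k + 1];   /* two packed B columns */
        r00 += a0 * b0;  r10 += a1 * b0;
        r01 += a0 * b1;  r11 += a1 * b1;
    }
    c0[0] += alpha * r00;  c0[1] += alpha * r10;    /* first column of C    */
    c1[0] += alpha * r01;  c1[1] += alpha * r11;    /* second column of C   */
}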
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG2_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG2_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/2; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + 
vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c new file mode 100644 index 0000000000..aa58bcc766 --- /dev/null +++ b/kernel/riscv64/gemmkernel_4x4_rvv.c @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
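gemmkernel_4x4_rvv.c below extends the same reduction scheme to a 4x4 tile: vlseg4 de-interleaves four packed A rows and four packed B columns per K step, sixteen accumulator vectors (vres0..vres15) hold the partial products, and each is reduced, scaled by alpha and added to its C entry. A compact scalar reference of that tile update follows; gemm_4x4_ref is an illustrative name and c[j] is assumed to point at column j of the 4x4 block of C.

/* Scalar sketch of one 4x4 tile of the kernel below: acc[j][i] plays the
 * role of the reduced accumulator for row i, column j of the tile. */
void gemm_4x4_ref(long bk, float alpha, const float *pa, const float *pb,
                  float *const c[4])
{
    float acc[4][4] = {{0}};
    long k, i, j;
    for (k = 0; k < bk; k++)
        for (j = 0; j < 4; j++)             /* tile column (packed B) */
            for (i = 0; i < 4; i++)         /* tile row    (packed A) */
                acc[j][i] += pa[4 * k + i] * pb[4 * k + j];
    for (j = 0; j < 4; j++)
        for (i = 0; i < 4; i++)
            c[j][i] += alpha * acc[j][i];
}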
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEG2_FLOAT vlseg2e32_v_f32m1 +#define VLSEG4_FLOAT vlseg4e32_v_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m1 +#define VFMACCVF_FLOAT vfmacc_vf_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEG2_FLOAT vlseg2e64_v_f64m1 +#define VLSEG4_FLOAT vlseg4e64_v_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m1 +#define VFMACCVF_FLOAT vfmacc_vf_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU + + FLOAT_V_T va0, va1, va2, va3; + FLOAT_V_T vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/4; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + vres8 = VFMVVF_FLOAT(0.0, vlmax); + vres9 = VFMVVF_FLOAT(0.0, vlmax); + vres10 = VFMVVF_FLOAT(0.0, vlmax); + vres11 = VFMVVF_FLOAT(0.0, vlmax); + vres12 = VFMVVF_FLOAT(0.0, vlmax); + vres13 = VFMVVF_FLOAT(0.0, vlmax); + vres14 = VFMVVF_FLOAT(0.0, vlmax); + vres15 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); + vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); + vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); + vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); + + vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); + vres13 = 
VFMACCVV_FLOAT(vres13, va3, vb2, vl); + vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); + vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); + + ptrba += vl*4; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + C2 += 4; + C3 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + ptrba += vl*2; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + C1 += 2; + C2 += 2; + C3 += 2; + } + + if(bm & 1) { + 
ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + + ptrba += vl; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 1; + C1 += 1; + C2 += 1; + C3 += 1; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + if(bn & 2) { + + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); + vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); + + ptrba += vl*4; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + 
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + ptrba += vl*4; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c new file mode 100644 index 0000000000..5cd509f93a --- /dev/null +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -0,0 +1,601 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#endif + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + // N:8 + for (j = bn/8; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + C4 = C3 + ldc; + C5 = C4 + ldc; + C6 = C5 + ldc; + C7 = C6 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrba += vl; + ptrbb += 8; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), 
va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 
= VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VLEV_FLOAT(C4, vl); + va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VLEV_FLOAT(C5, vl); + va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VLEV_FLOAT(C6, vl); + va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VLEV_FLOAT(C7, vl); + va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); + VSEV_FLOAT(C7, va7, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + + bb += (bk<<3); + C += (ldc<<3); + } + + // N:4 + if (bn & 4) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrba += vl; + ptrbb += 4; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = 
VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + // N:2 + if (bn & 2) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrba += vl; + ptrbb += 2; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), 
va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + C0 += vl; + C1 += vl; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + // N:1 + if (bn & 1) { + C0 = C; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrba += vl; + ptrbb += 1; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + C0 += vl; + } + + bb += (bk); + C += (ldc); + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c new file mode 100644 index 0000000000..9d2dee6158 --- /dev/null +++ b/kernel/riscv64/gemv_n_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
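The block above completes gemmkernel_rvv_v1x8.c: N is blocked into 8/4/2/1 columns, M is strip-mined by the vector length returned by VSETVL, and K is unrolled by eight (the bk/8 main loop plus the bk&7 remainder), with the #if 0 branches keeping the straight-line reference loops. As a sanity check, a scalar model of one vl-by-8 tile is sketched below; the packed-panel names (a_panel, b_panel) and the use of double are illustrative only and not part of the patch.

/* Scalar model of one vl x 8 tile of the kernel above:
 * C(i, j) += alpha * sum_k a_panel[k*vl + i] * b_panel[k*8 + j],
 * with A packed vl elements per k step and B packed 8 per k step. */
#include <stddef.h>

static void ref_tile_vl_x8(size_t vl, size_t bk, double alpha,
                           const double *a_panel,   /* packed, vl * bk   */
                           const double *b_panel,   /* packed, bk * 8    */
                           double *c, size_t ldc)   /* column-major tile */
{
    for (size_t j = 0; j < 8; j++)
        for (size_t i = 0; i < vl; i++) {
            double acc = 0.0;
            for (size_t k = 0; k < bk; k++)
                acc += a_panel[k * vl + i] * b_panel[k * 8 + j];
            c[j * ldc + i] += alpha * acc;
        }
}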
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if(n < 0) return(0); + + FLOAT *a_ptr, *x_ptr; + BLASLONG i; + FLOAT_V_T va, vy; + + if(inc_y == 1) { + + for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLEV_FLOAT(y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSEV_FLOAT(y, vy, vl); + } + + } else { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLSEV_FLOAT(y, stride_y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + return(0); +} diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c new file mode 100644 index 0000000000..a80af81b63 --- /dev/null +++ b/kernel/riscv64/gemv_t_rvv.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
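gemv_n_rvv.c above computes y := alpha*A*x + y for column-major A: a vl-long chunk of y is kept in a register while the loop walks the n columns, using vle/vse when inc_y is 1 and the strided vlse/vsse forms otherwise. A plain scalar equivalent, for reference only:

/* Scalar equivalent of the gemv_n kernel above (column-major A). */
static void ref_gemv_n(long m, long n, double alpha,
                       const double *a, long lda,
                       const double *x, long inc_x,
                       double *y, long inc_y)
{
    for (long i = 0; i < m; i++)          /* rows of y, strip-mined by vl above */
        for (long j = 0; j < n; j++)      /* columns of A */
            y[i * inc_y] += alpha * x[j * inc_x] * a[j * lda + i];
}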
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT *a_ptr, *x_ptr; + + FLOAT_V_T va, vx, vr; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + if(inc_x == 1) { + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLEV_FLOAT(x_ptr, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLSEV_FLOAT(x_ptr, stride_x, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } + + return(0); +} diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c new file mode 100644 index 0000000000..8b33b3bcbe --- /dev/null +++ b/kernel/riscv64/iamax_rvv.c @@ -0,0 +1,150 @@ 
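gemv_t_rvv.c above handles the transposed case y := alpha*A^T*x + y: each column of A is dotted with x by accumulating vfmacc products in a vector register and folding it with vfredusum into an m1 register, one reduction per output element. Scalar reference, for illustration only:

/* Scalar equivalent of the gemv_t kernel above: one dot product per column. */
static void ref_gemv_t(long m, long n, double alpha,
                       const double *a, long lda,
                       const double *x, long inc_x,
                       double *y, long inc_y)
{
    for (long j = 0; j < n; j++) {
        double dot = 0.0;
        for (long i = 0; i < m; i++)
            dot += a[j * lda + i] * x[i * inc_x];
        y[j * inc_y] += alpha * dot;
    }
}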
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT 
vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c new file mode 100644 index 0000000000..585b371861 --- /dev/null +++ b/kernel/riscv64/iamin_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
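iamax_rvv.c above tracks the running per-lane maximum of |x| together with a per-lane index vector: where a lane improves, vid_v regenerates the lane index and the masked vadd_vx rebases it by the current offset j; afterwards vfredmax finds the global maximum, vmfge plus vfirst locate the first lane holding it, and vslidedown/vmv_x_s pull out the stored element index. The intended result is the usual 1-based BLAS semantics, sketched here in scalar form for reference:

/* Scalar reference for iamax: first index (1-based) of the largest |x[i]|,
 * 0 for invalid input. Illustrative only. */
#include <math.h>

static long ref_iamax(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0;
    double maxf = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * inc_x]);
        if (v > maxf) { maxf = v; best = i; }
    }
    return best + 1;
}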
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = 
VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c new file mode 100644 index 0000000000..d84ad968e2 --- /dev/null +++ b/kernel/riscv64/imax_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_min; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = 
VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c new file mode 100644 index 0000000000..fb734f6f8a --- /dev/null +++ b/kernel/riscv64/imin_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = 
VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c new file mode 100644 index 0000000000..9cb332cbb6 --- /dev/null +++ b/kernel/riscv64/izamax_rvv.c @@ -0,0 +1,162 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + } + else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + 
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + + } + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c new file mode 100644 index 0000000000..69771e5aa5 --- /dev/null +++ b/kernel/riscv64/izamin_rvv.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
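izamax_rvv.c above is the complex variant: vlseg2e/vlsseg2e segment loads split the interleaved real and imaginary parts into two registers, and the comparison key is |re| + |im| (the usual BLAS cabs1), not the true modulus. A scalar reference of that semantics, illustrative only:

/* Scalar reference for izamax: x is interleaved (re, im), the key is
 * |re| + |im|, and the returned index is 1-based. */
#include <math.h>

static long ref_izamax(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0;
    double maxf = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
        if (v > maxf) { maxf = v; best = i; }
    }
    return best + 1;
}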
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = 
VFMINVV_FLOAT(v_min, vx0, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c new file mode 100644 index 0000000000..5b1380d2b2 --- /dev/null +++ b/kernel/riscv64/max_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c new file mode 100644 index 0000000000..bddcc0ba7d --- /dev/null +++ b/kernel/riscv64/min_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
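max_rvv.c above (and min_rvv.c that follows) are plain strip-mined reductions: an element-wise vfmax/vfmin accumulates across vl-sized chunks and a single vfredmax/vfredmin folds the accumulator into an m1 register at the end; note there is no absolute value here, unlike the amax/amin kernels. Scalar reference, with double precision chosen only for illustration:

/* Scalar reference for the max reduction above; min is the same with the
 * comparison flipped. Returns 0.0 for invalid input, as the kernel does. */
#include <float.h>

static double ref_max(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    double m = -DBL_MAX;
    for (long i = 0; i < n; i++)
        if (x[i * inc_x] > m) m = x[i * inc_x];
    return m;
}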
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c new file mode 100644 index 0000000000..3f5d50397e --- /dev/null +++ b/kernel/riscv64/nrm2_rvv.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define ABS fabsf +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define ABS fabs +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if( n <= 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v0 = VFABSV_FLOAT(v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl * inc_x) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c new file mode 100644 index 0000000000..7bf5e42703 --- /dev/null +++ b/kernel/riscv64/rot_rvv.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if(n <= 0) return(0); + + FLOAT_V_T v0, v1, vx, vy; + + if (inc_x == 0 || inc_y == 0) { + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_y == 1) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_x == 1) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, 
vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c new file mode 100644 index 0000000000..d2c0378bfe --- /dev/null +++ b/kernel/riscv64/scal_rvv.c @@ -0,0 +1,80 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
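All four stride combinations in the rot kernel above apply the same plane rotation; the scalar fallback at the top of that kernel is the definition. A compact reference, assuming non-negative increments for brevity (illustration only, helper name invented):

/* Scalar form of the rotation applied by the rot kernel above (sketch). */
static void rot_ref(long n, float *x, long inc_x, float *y, long inc_y,
                    float c, float s)
{
    for (long i = 0; i < n; i++) {
        float t      = c * x[i * inc_x] + s * y[i * inc_y];
        y[i * inc_y] = c * y[i * inc_y] - s * x[i * inc_x];
        x[i * inc_x] = t;
    }
}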
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( (n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + + } + + return 0; +} diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c new file mode 100644 index 0000000000..1db0d09ddf --- /dev/null +++ b/kernel/riscv64/sum_rvv.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
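The scal kernel above scales x in place by da; the y pointer and the dummy arguments in the CNAME signature are unused there. Its scalar meaning, as an illustration only (the kernel additionally returns early for non-positive n or inc_x):

/* Scalar equivalent of the scal kernel above (sketch only). */
static void scal_ref(long n, float da, float *x, long inc_x)
{
    for (long i = 0; i < n; i++)
        x[i * inc_x] *= da;
}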
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + sumf = VFMVFS_FLOAT_M1(v_res); + return(sumf); +} diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c new file mode 100644 index 0000000000..2cf92f6ad9 --- /dev/null +++ b/kernel/riscv64/swap_rvv.c @@ -0,0 +1,142 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
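Unlike asum, the sum kernel above accumulates signed values, so cancellation between positive and negative entries is possible. Its scalar meaning (illustration only, helper name invented):

/* Scalar equivalent of the sum kernel above (sketch only). */
static float sum_ref(long n, const float *x, long inc_x)
{
    float s = 0.0f;
    for (long i = 0; i < n; i++)
        s += x[i * inc_x];
    return s;
}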
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG stride_x, stride_y; + FLOAT_V_T vx, vy; + + if (n <= 0) return(0); + + if (inc_x == 0 && inc_y == 0) { + if (n & 1) { + FLOAT temp = x[0]; + x[0] = y[0]; + y[0] = temp; + } + else { + return 0; + } + } + else if(inc_x == 0) { + FLOAT temp = x[0]; + x[0] = y[(n - 1) * inc_y]; + FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one + stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) { + vl = VSETVL(m); + vy = VLSEV_FLOAT(ptr - 1, stride_y, vl); + VSSEV_FLOAT(ptr, stride_y, vy, vl); + } + y[0] = temp; + } + else if(inc_y == 0) { + FLOAT temp = y[0]; + y[0] = x[(n - 1) * inc_x]; + FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one + stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) { + vl = VSETVL(m); + vx = VLSEV_FLOAT(ptr - 1, stride_x, vl); + VSSEV_FLOAT(ptr, stride_x, vx, vl); + } + x[0] = temp; + } + else if(inc_x == 1 && inc_y == 1) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else if (inc_y == 1) { + stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + + } else if(inc_x == 1) { + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c 
b/kernel/riscv64/symm_lcopy_rvv_v1.c new file mode 100644 index 0000000000..f0def96176 --- /dev/null +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -0,0 +1,101 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_lcopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX + posY * lda; + ao2 = a + posY + (posX) * lda; + + for (i = m; i > 0; i--, offset--) { + va2 = VLSEV_FLOAT(ao2, stride_lda, vl); + va1 = VLEV_FLOAT(ao1, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = 
VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1 += lda; + ao2++; + } + } + + return 0; +} + diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c new file mode 100644 index 0000000000..958506df31 --- /dev/null +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
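The comparison in the symm copy kernel above builds a per-lane mask from a lane-index vector, exactly as its inline comment states: offset > -index is evaluated as offset + index > 0. In scalar form (illustration only, helper name invented):

/* Scalar form of the lane mask used by the symm copy kernels (sketch). */
static void lane_mask_ref(size_t vl, long offset, unsigned char *mask)
{
    for (size_t k = 0; k < vl; k++)
        mask[k] = (offset + (long)k) > 0;
}

The mask then drives VMERGE_VVM_FLOAT to pick, lane by lane, between the unit-stride and the stride-lda load, so the full symmetric block is reconstructed from the single stored triangle.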
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_ucopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posX + 0 + posY * lda; + + for (i = m; i > 0; i--, offset--) { + va1 = VLSEV_FLOAT(ao1, stride_lda, vl); + va2 = VLEV_FLOAT(ao2, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1++; + ao2 += lda; + } + } + + return 0; +} diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c new file mode 100644 index 0000000000..737abaae3e --- /dev/null +++ b/kernel/riscv64/symv_L_rvv.c @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
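Both symm copy variants above pair a unit-stride load with a stride-lda load of the other operand (the ucopy version simply swaps which pointer is strided and which advances by lda per step). With the column-major layout these kernels assume, the stride-lda load gathers elements lda apart, i.e. it walks along one row of A; a scalar picture of that gather (illustration only, helper name invented):

/* Scalar meaning of the stride-lda loads used by the symm/trmm copy
   kernels: consecutive lanes step lda elements apart (sketch only). */
static void row_gather_ref(const float *a, long lda, size_t vl, float *chunk)
{
    for (size_t k = 0; k < vl; k++)
        chunk[k] = a[k * lda];
}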
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1, vl; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + if(inc_x == 1 && inc_y == 1) + { + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jy += inc_y; + a_ptr += lda; + } + } + else 
if(inc_y == 1) + { + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + inc_xv = inc_x * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + a_ptr += lda; + } + } + else + { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c new file mode 100644 index 0000000000..cb923be5d1 --- /dev/null +++ b/kernel/riscv64/symv_U_rvv.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
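Each outer iteration of the symv_L kernel above handles one column of the lower triangle: the column is scaled by temp1 = alpha*x[j] into y, and in the same pass it is dotted with x, with the reduced dot product folded back into y[j]. The classic scalar column update being vectorized looks like this (illustration only, based on the reference BLAS formulation rather than copied from the patch; names invented):

/* One column update of lower-triangular SYMV (scalar sketch). */
static void symv_L_col_ref(long m, long j, float alpha,
                           const float *a_col,   /* column j of A */
                           const float *x, float *y)
{
    float temp1 = alpha * x[j];
    float temp2 = 0.0f;
    y[j] += temp1 * a_col[j];                /* diagonal element          */
    for (long i = j + 1; i < m; i++) {
        y[i]  += temp1 * a_col[i];           /* y += temp1 * A(:,j)       */
        temp2 += a_col[i] * x[i];            /* dot of the column with x  */
    }
    y[j] += alpha * temp2;
}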
+*****************************************************************************/ + + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + FLOAT_V_T_M1 v_res, v_z0; + size_t vl_max = VSETVL_MAX_M1, vl; + v_res = VFMVVF_FLOAT_M1(0, vl_max); + v_z0 = VFMVVF_FLOAT_M1(0, vl_max); + vl_max = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + if(inc_x == 1 && inc_y == 1) + { + a_ptr += m1 * lda; + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jy += inc_y; + } + } + else if(inc_y == 1) + { + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + vr = VFMVVF_FLOAT(0, vl); + + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = 
VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + } + } + else + { + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c new file mode 100644 index 0000000000..73a8233f8d --- /dev/null +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
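The symv_U kernel above mirrors the lower case but works with the part of each column above the diagonal and, as the visible epilogue shows, adds the diagonal term explicitly: y[j] += temp1*a_ptr[j] + alpha*dot. The corresponding scalar column update (illustration only, reference BLAS formulation, names invented):

/* One column update of upper-triangular SYMV (scalar sketch). */
static void symv_U_col_ref(long j, float alpha,
                           const float *a_col,   /* column j of A */
                           const float *x, float *y)
{
    float temp1 = alpha * x[j];
    float temp2 = 0.0f;
    for (long i = 0; i < j; i++) {
        y[i]  += temp1 * a_col[i];
        temp2 += a_col[i] * x[i];
    }
    y[j] += temp1 * a_col[j] + alpha * temp2;   /* matches the kernel's epilogue */
}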
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT vid_v_i32m2 +#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c new file mode 100644 index 0000000000..2fe8cf79e1 --- /dev/null +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
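In the diagonal-block case, the trmm copy kernel above masks each loaded slice with a lane-index comparison: lanes whose index exceeds the current position j are forced to zero, and when UNIT is defined the lane equal to j is forced to one (the lt/un variants flip the comparison to less-than via VMSLTU). As a scalar rule applied to one already-loaded slice (illustration only, helper name invented):

/* Scalar form of the masked merge used for diagonal blocks (sketch). */
static void mask_diag_slice_ref(size_t vl, size_t j, const float *slice,
                                float *out, int unit)
{
    for (size_t k = 0; k < vl; k++) {
        float v = (k > j) ? 0.0f : slice[k];  /* VMSGTU + VFMERGE with ZERO */
        if (unit && k == j)
            v = 1.0f;                         /* VMSEQ + VFMERGE with ONE   */
        out[k] = v;
    }
}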
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c new file mode 100644 index 0000000000..b64cd840d0 --- /dev/null +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda; + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X > posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c new file mode 100644 index 0000000000..b96daae5be --- /dev/null +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -0,0 +1,133 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T vb, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + ao ++; + b += vl; + X ++; + i++; + } + else if (X > posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + ao += lda; + b += vl; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + } + 
}while (i < m); + posY += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c new file mode 100644 index 0000000000..127e76970a --- /dev/null +++ b/kernel/riscv64/trmmkernel_2x2_rvv.c @@ -0,0 +1,342 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
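All of the _rvv_v1 copy routines above emit their block as consecutive slices of vl elements, advancing b by vl after every vector store, so the packed panel is simply a sequence of vl-wide slices that a compute kernel can presumably reload with one vector load each. Indexing that layout in plain C (illustration only, helper name invented):

/* Layout of the packed buffer written by the _rvv_v1 copy routines:
   slice i occupies b[i*vl .. i*vl + vl - 1] (sketch only). */
static float packed_at_ref(const float *b, size_t vl, long slice, size_t lane)
{
    return b[(size_t)slice * vl + lane];
}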
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_2x2.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 v_res, v_z0; + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl * 2; + ptrbb += vl * 2; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); + C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+2; + C1 = C1+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off*2; 
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl * 2; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + C1 = C1+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off; +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + vb0 = VLEV_FLOAT(ptrbb, vl); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); + + ptrba += vl * 2; + ptrbb += vl; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp; +#endif +#ifdef LEFT + off += 2; +#endif + + C0 = C0+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + ptrba += vl; + ptrbb += vl; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp; + ptrbb += temp; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c new file mode 100644 index 0000000000..3e46c6348b --- /dev/null +++ 
b/kernel/riscv64/trmmkernel_4x4_rvv.c @@ -0,0 +1,881 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEG4_FLOAT vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMUL_FLOAT vfmul_vv_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT vfmacc_vv_f32m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEG4_FLOAT vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMUL_FLOAT vfmul_vv_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT vfmacc_vv_f64m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_4x4.c + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vl; + size_t vlmax = VSETVL_MAX; + + FLOAT_V_T vres0_0; + FLOAT_V_T vres0_1; + FLOAT_V_T vres0_2; + FLOAT_V_T vres0_3; + + FLOAT_V_T vres1_0; + FLOAT_V_T vres1_1; + FLOAT_V_T vres1_2; + 
FLOAT_V_T vres1_3; + + FLOAT_V_T vres2_0; + FLOAT_V_T vres2_1; + FLOAT_V_T vres2_2; + FLOAT_V_T vres2_3; + + FLOAT_V_T vres3_0; + FLOAT_V_T vres3_1; + FLOAT_V_T vres3_2; + FLOAT_V_T vres3_3; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); + vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); + vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); + vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); + vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); + vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); + + ptrba += vl * 4; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + if (!backwards) { + temp = bk-off; + temp = left ? 
temp - 4 : // number of values in A + temp - 4; // number of values in B + + ptrba += temp*4; // number of values in A + ptrbb += temp*4; // number of values in B + } +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + C1 = C1+4; + C2 = C2+4; + C3 = C3+4; + + } + + if ( bm & 2 ) // do any 2x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres1_1 = VFMVVF_FLOAT(0, vlmax); + + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres2_1 = VFMVVF_FLOAT(0, vlmax); + + vres3_0 = VFMVVF_FLOAT(0, vlmax); + vres3_1 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+4; // number of values in B +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); + vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); + + ptrba += vl * 2; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); + + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + C1 = C1+2; + C2 = C2+2; + C3 = C3+2; + + } + + if ( bm & 1 ) // do any 1x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres3_0 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, 
va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + ptrba += vl; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + C1 = C1+1; + C2 = C2+1; + C3 = C3+1; + + } + + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + for (j=0; j<(bn&2); j+=2) // do the Mx2 loops + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i=0; i 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); + + ptrba += vl * 4; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*4; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + C1 = C1+4; + + } + + if ( bm & 2 ) // do any 2x2 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres1_1 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp; k > 0; k -= vl) 
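+            // Note (annotation, not part of the original patch): K loop for the 2x2 tail.
+            // The segmented loads de-interleave vl K-steps of the two packed A rows
+            // (va0/va1) and the two packed B columns (vb0/vb1); the four vres*_* vectors
+            // hold partial dot products that are reduced with vfredusum after the loop.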
+ { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + + ptrba += vl * 2; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + C1 = C1+2; + + } + + if ( bm & 1 ) // do any 1x2 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*2; +#endif + + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres1_0 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + C1 = C1+1; + + } + + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + + for (j=0; j<(bn&1); j+=1) // do the Mx1 loops + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i=0; i 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + + ptrba += vl * 4; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + 
temp = bk - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*4; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + + } + + if ( bm & 2 ) // do any 2x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + + ptrba += vl * 2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + + } + + if ( bm & 1 ) // do any 1x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c new file mode 100644 index 0000000000..97b14650c2 --- /dev/null +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -0,0 +1,685 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_8x8.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + for (j = bn/8; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + C4 = C3+ldc; + C5 = C4+ldc; + C6 = C5+ldc; + C7 = C6+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*8; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+8; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = 
VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb 
+ 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VFMULVF_FLOAT(vres4, alpha, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VFMULVF_FLOAT(vres5, alpha, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VFMULVF_FLOAT(vres6, alpha, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VFMULVF_FLOAT(vres7, alpha, vl); + VSEV_FLOAT(C7, va7, vl); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 8; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*8; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; +#endif + + bb += (bk<<3); + C += (ldc<<3); + } + + if (bn & 4) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*4; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, 
vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk<<2); + C += (ldc<<2); + } + + if (bn & 2) + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = 
VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + bb += (bk<<1); + C += (ldc<<1); + } + + if (bn & 1) + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*1; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = 
VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + bb += (bk); + C += (ldc); + } + return 0; +} + diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c new file mode 100644 index 0000000000..11a0398ca1 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -0,0 +1,847 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pa, *pc; + + int i, j, k; + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + + size_t vl; + FLOAT_V_T va, vc; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + for (j = 0; j < n; j ++) + { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + pa = a; + pc = c + j * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLEV_FLOAT(pc, vl); + va = VLEV_FLOAT(pa, vl); + vc = VFNMSACVF_FLOAT(vc, bb, va, vl); + VSEV_FLOAT(pc, vc, vl); + pa += vl; + pc += vl; + } + } + a -= m; + b -= 2 * n; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1; + FLOAT *pa, *pc, *pc0, *pc1; + FLOAT *pb0, *pb1; + + int i, j, k; + fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + + size_t vl; + FLOAT_V_T va, vc0, vc1; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/2; j ++) + { + //bb = *(c + i + j * ldc); + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + //bb *= aa; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + //*b = bb; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + + //*(c + i + j * ldc) = bb; + //b ++; + + b += 2; + //pa = a + i + 1; + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + pa = a; + //pc = c + j * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + 
VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + } + pc += ldc * (n/2) * 2; + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1, bb2, bb3; + FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3; + FLOAT *pb0, *pb1, *pb2, *pb3; + + int i, j, k; + + size_t vl; + FLOAT_V_T va, vc0, vc1, vc2, vc3; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/4; j ++) + { + pb0 = pc + j * ldc * 4; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + + b += 4; + + pc0 = c + j * ldc * 4; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + } + pc += ldc * (n/4) * 4; + + if (n & 2) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + + *b = bb0; + *(b+1) = bb1; + + *pb0 = bb0; + *pb1 = bb1; + + b += 2; + + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; + FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + + int i, j, k; + + size_t vl; + FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/8; j ++) + { + pb0 = pc + j * ldc * 8; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + pb4 = pb3 + ldc; + pb5 = pb4 + ldc; + pb6 = pb5 + ldc; + pb7 = pb6 + ldc; + + bb0 = (*pb0) * 
aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + bb4 = (*pb4) * aa; + bb5 = (*pb5) * aa; + bb6 = (*pb6) * aa; + bb7 = (*pb7) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + *(b+4) = bb4; + *(b+5) = bb5; + *(b+6) = bb6; + *(b+7) = bb7; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + *pb4 = bb4; + *pb5 = bb5; + *pb6 = bb6; + *pb7 = bb7; + + b += 8; + + pc0 = c + j * ldc * 8; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + vc4 = VLEV_FLOAT(pc4, vl); + vc5 = VLEV_FLOAT(pc5, vl); + vc6 = VLEV_FLOAT(pc6, vl); + vc7 = VLEV_FLOAT(pc7, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); + vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); + vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); + vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + VSEV_FLOAT(pc4, vc4, vl); + VSEV_FLOAT(pc5, vc5, vl); + VSEV_FLOAT(pc6, vc6, vl); + VSEV_FLOAT(pc7, vc7, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + pc4 += vl; + pc5 += vl; + pc6 += vl; + pc7 += vl; + } + } + pc += ldc * (n/8) * 8; + + if (n & 4) + { + pb0 = pc + j * ldc * 4; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + + b += 4; + + pc0 = c + j * ldc * 4; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + pc += ldc * 4; + } + + if (n & 2) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + + *b = bb0; + *(b+1) = bb1; + + *pb0 = bb0; + *pb1 = bb1; + + b += 2; + + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} +#else +static 
inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, 
ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * j * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c new file mode 100644 index 0000000000..0380bd1bbe --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c + +#ifndef COMPLEX +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT aa, bb; + FLOAT *pa, *pc; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc; + for (i = 0; i < m; i++) + { + aa = *(a + i); + for (j = 0; j < n; j ++) + { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b++; + pa = a + i + 1; + pc = c + j * ldc + i + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLEV_FLOAT(pc, vl); + va = VLEV_FLOAT(pa, vl); + vc = VFNMSACVF_FLOAT(vc, bb, va, vl); + VSEV_FLOAT(pc, vc, vl); + pa += vl; + pc += vl; + } + } + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1; + FLOAT *pa, *pc, *pc0, *pc1; + FLOAT *pb0, *pb1; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc0, vc1; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/2; j ++) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + } + pc += ldc * (n/2) * 2; + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += 
m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pb0, *pb1, *pb2, *pb3; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/4; j ++) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + } + pc += ldc * (n/4) * 4; + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/8; j ++) + { + pb0 = pc + j * ldc * 8; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + pb4 = pb3 + ldc; + pb5 = pb4 + ldc; + pb6 = pb5 + ldc; + pb7 = pb6 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + bb4 = (*pb4) * aa; + bb5 = (*pb5) * aa; + bb6 = (*pb6) * aa; + bb7 = (*pb7) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + *(b+4) = bb4; + *(b+5) = bb5; + *(b+6) = bb6; + *(b+7) = bb7; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + *pb4 = bb4; + *pb5 = bb5; + *pb6 = bb6; + *pb7 = bb7; + b += 8; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = 
pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + vc4 = VLEV_FLOAT(pc4, vl); + vc5 = VLEV_FLOAT(pc5, vl); + vc6 = VLEV_FLOAT(pc6, vl); + vc7 = VLEV_FLOAT(pc7, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); + vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); + vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); + vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + VSEV_FLOAT(pc4, vc4, vl); + VSEV_FLOAT(pc5, vc5, vl); + VSEV_FLOAT(pc6, vc6, vl); + VSEV_FLOAT(pc7, vc7, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + pc4 += vl; + pc5 += vl; + pc6 += vl; + pc7 += vl; + } + } + pc += ldc * (n/8) * 8; + + if (n & 4) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + pc += ldc * 4; + } + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT 
aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + + +static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + FLOAT *pa, *pc; + + int i, j, k; + + size_t vl; + FLOAT_V_T va0, va1, vc0, vc1; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + pa = a + (i + 1) * 2; + pc = c + j * ldc + (i + 1) * 2; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, pa, vl); + VLSEG2_FLOAT(&vc0, &vc1, pc, vl); +#ifndef CONJ + vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); + vc0 = VFMACCVF_FLOAT(vc0, cc2, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); +#else + vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); + vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1); + vc1 = VFMACCVF_FLOAT(vc1, cc1, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); +#endif + VSSEG2_FLOAT(pc, vc0, vc1, vl); + pa += vl * 2; + pc += vl * 2; + } + } + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 
1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c new file mode 100644 index 0000000000..41368be600 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -0,0 +1,792 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + + for (j = 0; j < m; j ++) + { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b + i + 1; + pc = c + j + (i + 1) *ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc ++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = 
VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; 
+ *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + + a += 8; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + pc4 = pa4 + ldc; + pc5 = pa5 + ldc; + pc6 = pa6 + ldc; + pc7 = pa7 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, 
vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} +#else +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = vl; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; 
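+          /* Descriptive note (added): kk counts the columns of B already factored in this RN pass (it starts at -offset), and it is what GEMM_KERNEL uses as the rank-update depth for the next, narrower remainder block. */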
+ } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c new file mode 100644 index 0000000000..459c1663ac --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -0,0 +1,828 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b; + pc = c + j; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc++; + } + } + b -= n; + a -= 2 * m; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 2; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + 
vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc + j * 2; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * 
bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + a += 8; + + pb = b; + pc0 = c + j * 8; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + pc4 = pc3 + 1; + pc5 = pc4 + 1; + pc6 = pc5 + 1; + pc7 = pc6 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, 
aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - j) * vl * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, + b + (kk - 
GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c new file mode 100644 index 0000000000..bacfb2b08e --- /dev/null +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 + +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + *(b + j) = INV(*(ao + j * lda)); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c new file mode 100644 index 0000000000..0fc7c9f243 --- /dev/null +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j)); + + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + b += vl; + ao += lda; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c new file mode 100644 index 0000000000..ee869a7951 --- /dev/null +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j * lda)); + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c new file mode 100644 index 0000000000..a324b0fa6f --- /dev/null +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + *(b + j) = INV(*(ao + j)); + + ao += lda; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c new file mode 100644 index 0000000000..1917042be4 --- /dev/null +++ b/kernel/riscv64/zamax_rvv.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T v0, v1, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c new file mode 100644 index 0000000000..3f027383a0 --- /dev/null +++ b/kernel/riscv64/zamin_rvv.c @@ 
-0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T v0, v1, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, 
&v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c new file mode 100644 index 0000000000..7876646b32 --- /dev/null +++ b/kernel/riscv64/zasum_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
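// The complex amax/amin kernels above measure each element by |Re| + |Im|
// (the BLAS CABS1 convention) rather than by the true modulus. A scalar sketch
// of what zamin_rvv.c computes (helper name made up; <math.h> and <float.h>
// are assumed):
static FLOAT zamin_sketch(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    FLOAT m = FLT_MAX;                            // same seed as the vector code
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT t = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);  // |Re| + |Im|
        if (t < m) m = t;
    }
    return m;
}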
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v1 = VLEV_FLOAT(x+vl, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + else { + + int stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v1 = VLSEV_FLOAT(x+1, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + asumf += VFMVFS_FLOAT_M1(v_res); + + return(asumf); +} diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c new file mode 100644 index 0000000000..66f52d9d0b --- /dev/null +++ b/kernel/riscv64/zaxpby_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
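// Scalar sketch of what zasum_rvv.c above computes: the sum of |Re| + |Im| over
// all n complex elements (helper name made up; <math.h> assumed). In the unit
// stride path above, the two plain loads at x and x+vl do not separate real
// from imaginary lanes, which is harmless because only the overall sum is needed.
static FLOAT zasum_sketch(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    FLOAT s = 0.0;
    for (BLASLONG i = 0; i < n; i++)
        s += fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
    return s;
}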
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/*************************************************************************** +* 2014/06/07 Saar +* +***************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFMSACVF_FLOAT vfmsac_vf_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFMSACVF_FLOAT vfmsac_vf_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) +{ + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + BLASLONG stride_x = inc_x2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y2 * sizeof(FLOAT); + FLOAT_V_T vx0, vx1, vy0, vy1; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + size_t vl = VSETVL(n); + FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); + for ( ; n > 0; n -= vl, y += vl*stride_y) + { + vl = VSETVL(n); + VSSSEG_FLOAT(y, stride_y, temp, temp, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); + vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); + + vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); + + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + } + else + { + FLOAT_V_T v0, v1; + + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vy1, beta_i, vl); + v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); + + v1 = VFMULVF_FLOAT(vy1, beta_r, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); + v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); + 
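// v0 so far holds Re(alpha*x); the next two fused ops add beta_r*Re(y) and
// subtract beta_i*Im(y), completing Re(alpha*x + beta*y). v1 below is built
// the same way for the imaginary part.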
v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + } + } + } + return(0); + +} diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c new file mode 100644 index 0000000000..777bcb7287 --- /dev/null +++ b/kernel/riscv64/zaxpy_rvv.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if(n < 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1) { + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 
= VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c new file mode 100644 index 0000000000..5d8322bbbd --- /dev/null +++ b/kernel/riscv64/zcopy_rvv.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
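// Scalar sketch of the update performed by zaxpy_rvv.c above (helper name made
// up): y += da * x, or y += da * conj(x) when CONJ is defined, with
// da = da_r + i*da_i.
static void zaxpy_sketch(BLASLONG n, FLOAT da_r, FLOAT da_i,
                         const FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT xr = x[2 * i * inc_x], xi = x[2 * i * inc_x + 1];
#if !defined(CONJ)
        y[2 * i * inc_y]     += da_r * xr - da_i * xi;   // Re(da * x)
        y[2 * i * inc_y + 1] += da_r * xi + da_i * xr;   // Im(da * x)
#else
        y[2 * i * inc_y]     += da_r * xr + da_i * xi;   // Re(da * conj(x))
        y[2 * i * inc_y + 1] += da_i * xr - da_r * xi;   // Im(da * conj(x))
#endif
    }
}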
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_M8(n) vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 vle32_v_f32m8 +#define VSEV_FLOAT_M8 vse32_v_f32m8 + +#define VSETVL_M4(n) vsetvl_e32m4(n) +#define FLOAT_V_T_M4 vfloat32m4_t +#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 +#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 +#else +#define VSETVL_M8(n) vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 vle64_v_f64m8 +#define VSEV_FLOAT_M8 vse64_v_f64m8 + +#define VSETVL_M4(n) vsetvl_e64m4(n) +#define FLOAT_V_T_M4 vfloat64m4_t +#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 +#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + if(inc_x == 1 && inc_y == 1) { + + FLOAT_V_T_M8 vx; + n *= 2; // convert to words + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL_M8(n); + vx = VLEV_FLOAT_M8(x, vl); + VSEV_FLOAT_M8(y, vx, vl); + } + + }else if (1 == inc_x) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSEG_FLOAT_M4(&vr, &vi, x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } else if (1 == inc_y) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSEG_FLOAT_M4(y, vr, vi, vl); + } + } else { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c new file mode 100644 index 0000000000..7eae6f608e --- /dev/null +++ b/kernel/riscv64/zdot_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
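// For inc_x == 1 && inc_y == 1 the copy above treats the data as 2*n contiguous
// reals (n *= 2), so it is equivalent to a flat copy; a plain-C sketch of that
// fast path (requires <string.h>) would be:
//     memcpy(y, x, (size_t)n * 2 * sizeof(FLOAT));
// The strided paths use segment loads/stores so that each real/imaginary pair
// stays together while x or y is walked with its increment.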
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMSACVV_FLOAT vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMSACVV_FLOAT vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + OPENBLAS_COMPLEX_FLOAT result; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + + if ( n <= 0 ) return(result); + + FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax_m1 = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax_m1); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); + + size_t vlmax = VSETVL_MAX; + vr0 = VFMVVF_FLOAT(0, vlmax); + vr1 = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + + } else if (inc_x == 1){ + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } else if (inc_y == 1){ + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + 
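// Same accumulation as the branches above: vr0 gathers Re(x)*Re(y) -/+ Im(x)*Im(y)
// and vr1 gathers Re(x)*Im(y) +/- Im(x)*Re(y); without CONJ this sums x*y
// (ZDOTU), with CONJ it sums conj(x)*y (ZDOTC). Both accumulators are reduced
// to scalars with vfredusum after the loops.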
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + }else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } + + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax); + CREAL(result) = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax); + CIMAG(result) = VFMVFS_FLOAT_M1(v_res); + + return(result); +} diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c new file mode 100644 index 0000000000..a89752d18e --- /dev/null +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFSUBVV_FLOAT vfsub_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFSUBVV_FLOAT vfsub_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vr, vi, v1, v2, v3, v4; + + ldc *= 2; + c_offset = c; + + if (beta_r == 0.0 && beta_i == 0.0) { + + vl = VSETVL(m); + vr = VFMVVF_FLOAT(0.0, vl); + vi = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VLSEG_FLOAT(&vr, &vi, c_offset, vl); + + v1 = VFMULVF_FLOAT(vr, beta_r, vl); + v2 = VFMULVF_FLOAT(vi, beta_i, vl); + + v3 = VFMULVF_FLOAT(vi, beta_r, vl); + v4 = VFMULVF_FLOAT(vr, beta_i, vl); + + vr = VFSUBVV_FLOAT(v1, v2, vl); + vi = VFADDVV_FLOAT(v3, v4, vl); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c new file mode 100644 index 0000000000..2eeb61b453 --- /dev/null +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
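// Scalar sketch of the beta != 0 path in zgemm_beta_rvv.c above (helper name
// made up; ldc here is counted in complex elements, i.e. before the ldc *= 2
// done in the kernel): every entry of C is scaled by the complex scalar
// beta_r + i*beta_i.
static void zgemm_beta_sketch(BLASLONG m, BLASLONG n, FLOAT beta_r, FLOAT beta_i,
                              FLOAT *c, BLASLONG ldc)
{
    for (BLASLONG j = 0; j < n; j++)
        for (BLASLONG i = 0; i < m; i++) {
            FLOAT cr = c[2 * (i + j * ldc)];
            FLOAT ci = c[2 * (i + j * ldc) + 1];
            c[2 * (i + j * ldc)]     = beta_r * cr - beta_i * ci;   // Re(beta * C_ij)
            c[2 * (i + j * ldc) + 1] = beta_r * ci + beta_i * cr;   // Im(beta * C_ij)
        }
}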
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix; + FLOAT *a_ptr; + FLOAT temp_r, temp_i; + FLOAT_V_T va0, va1, vy0, vy1; + + BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; + + BLASLONG inc_x2 = inc_x * 2; + BLASLONG lda2 = lda * 2; + if (inc_y == 1) + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } + else + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = 
alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + return(0); +} diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c new file mode 100644 index 0000000000..b682d5cd88 --- /dev/null +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
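// Scalar sketch of zgemv_n_rvv.c above for the default (no CONJ, no XCONJ) case
// (helper name made up; lda counted in complex elements): y += alpha*A*x,
// computed column by column so the inner update vectorizes over the rows of A.
static void zgemv_n_sketch(BLASLONG m, BLASLONG n, FLOAT alpha_r, FLOAT alpha_i,
                           const FLOAT *a, BLASLONG lda,
                           const FLOAT *x, BLASLONG inc_x,
                           FLOAT *y, BLASLONG inc_y)
{
    for (BLASLONG j = 0; j < n; j++) {
        FLOAT tr = alpha_r * x[2 * j * inc_x]     - alpha_i * x[2 * j * inc_x + 1];
        FLOAT ti = alpha_r * x[2 * j * inc_x + 1] + alpha_i * x[2 * j * inc_x];
        for (BLASLONG i = 0; i < m; i++) {
            FLOAT ar = a[2 * (i + j * lda)], ai = a[2 * (i + j * lda) + 1];
            y[2 * i * inc_y]     += tr * ar - ti * ai;   // Re((alpha*x_j) * A_ij)
            y[2 * i * inc_y + 1] += tr * ai + ti * ar;   // Im((alpha*x_j) * A_ij)
        }
    }
}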
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp_r, temp_i; + + FLOAT_V_T va0, va1, vx0, vx1, vr, vi; + FLOAT_V_T_M1 v_res, v_z0; + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + //BLASLONG stride_a = sizeof(FLOAT) * 2; + BLASLONG inc_y2 = inc_y * 2; + BLASLONG lda2 = lda * 2; + + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL(m); + + if (inc_x == 1) + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + } + else + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, 
vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + + } + + + return(0); +} diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c new file mode 100644 index 0000000000..921ddb8cbd --- /dev/null +++ b/kernel/riscv64/znrm2_rvv.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
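// Scalar sketch of zgemv_t_rvv.c above for the default (no CONJ, no XCONJ) case
// (helper name made up; lda counted in complex elements): each column of A is
// dotted with x, and the result, scaled by alpha, is accumulated into y.
static void zgemv_t_sketch(BLASLONG m, BLASLONG n, FLOAT alpha_r, FLOAT alpha_i,
                           const FLOAT *a, BLASLONG lda,
                           const FLOAT *x, BLASLONG inc_x,
                           FLOAT *y, BLASLONG inc_y)
{
    for (BLASLONG j = 0; j < n; j++) {
        FLOAT tr = 0.0, ti = 0.0;
        for (BLASLONG i = 0; i < m; i++) {
            FLOAT ar = a[2 * (i + j * lda)], ai = a[2 * (i + j * lda) + 1];
            tr += ar * x[2 * i * inc_x]     - ai * x[2 * i * inc_x + 1];
            ti += ar * x[2 * i * inc_x + 1] + ai * x[2 * i * inc_x];
        }
        y[2 * j * inc_y]     += alpha_r * tr - alpha_i * ti;
        y[2 * j * inc_y + 1] += alpha_r * ti + alpha_i * tr;
    }
}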
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT vfabs_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT vfabs_v_f64m4 +#endif + +// TODO: Should single precision use the widening MAC, or perhaps all should be double? + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if ( n <= 0 ) return(0.0); + + FLOAT_V_T vr, v0, v1; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if (inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c new file mode 100644 index 0000000000..68066a00b4 --- /dev/null +++ b/kernel/riscv64/zrot_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if (n <= 0) return(0); + + FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + + if (inc_x == 0 && inc_y == 0) { + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + 
vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + + return 0; +} diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c new file mode 100644 index 0000000000..079c36a2df --- /dev/null +++ b/kernel/riscv64/zscal_rvv.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
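Before the zscal source: the kernel splits complex scaling x := (da_r + i*da_i) * x into special cases (both parts zero, purely imaginary, purely real) and a general case, operating on the real and imaginary planes loaded by the segment intrinsics. The general case is the ordinary complex multiply; a scalar sketch for a single element is below (the helper name is illustrative and not part of the kernel).

/* Scale one complex element by (da_r + i*da_i); the general branch of
 * the kernel performs this per lane with VFMUL/VFNMSAC/VFMACC. */
static void zscal_one(double *re, double *im, double da_r, double da_i)
{
    double tr = da_r * (*re) - da_i * (*im);  /* new real part      */
    double ti = da_r * (*im) + da_i * (*re);  /* new imaginary part */
    *re = tr;
    *im = ti;
}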
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if((n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T vt, vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + size_t vlmax = VSETVL_MAX; + + if(da_r == 0.0 && da_i == 0.0) { + + vr = VFMVVF_FLOAT(0.0, vlmax); + vi = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VSSEG_FLOAT(x, vr, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + } + + } else if(da_r == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vi, -da_i, vl); + vi = VFMULVF_FLOAT(vr, da_i, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + + } else if(da_i == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vr = VFMULVF_FLOAT(vr, da_r, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + + } else { + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vr, &vi, x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSEG_FLOAT(x, vt, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + } + } + + return(0); +} diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c new file mode 100644 index 0000000000..3928fbe276 --- /dev/null +++ b/kernel/riscv64/zsum_rvv.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + sumf += VFMVFS_FLOAT_M1(v_res); + + return(sumf); +} diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c new file mode 100644 index 0000000000..86f9103d34 --- /dev/null +++ b/kernel/riscv64/zswap_rvv.c @@ -0,0 +1,156 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+
+    if (n <= 0) return(0);
+
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+
+    if (inc_x == 0 && inc_y == 0) {
+        if (n & 1) {
+            FLOAT temp[2];
+            temp[0] = x[0];
+            temp[1] = x[1];
+            x[0] = y[0];
+            x[1] = y[1];
+            y[0] = temp[0];
+            y[1] = temp[1];
+        }
+        else {
+            return 0;
+        }
+    }
+    else if(inc_x == 0) {
+        FLOAT temp[2];
+        temp[0] = x[0];
+        temp[1] = x[1];
+        // x receives the last element of y (real and imaginary parts)
+        x[0] = y[(n - 1) * inc_y * 2];
+        x[1] = y[(n - 1) * inc_y * 2 + 1];
+        FLOAT* ptr = y + (n - 1) * inc_y * 2; // start from the last one
+        BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) {
+            vl = VSETVL(m);
+            VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl);
+            VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl);
+        }
+        y[0] = temp[0];
+        y[1] = temp[1];
+    }
+    else if(inc_y == 0) {
+        FLOAT temp[2];
+        temp[0] = y[0];
+        temp[1] = y[1];
+        // y receives the last element of x (real and imaginary parts)
+        y[0] = x[(n - 1) * inc_x * 2];
+        y[1] = x[(n - 1) * inc_x * 2 + 1];
+        FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one
+        BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) {
+            vl = VSETVL(m);
+            VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl);
+            VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl);
+        }
+        x[0] = temp[0];
+        x[1] = temp[1];
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y,
vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c new file mode 100644 index 0000000000..3486a46480 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -0,0 +1,596 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
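Before the kernel source: this complex TRMM micro-kernel accumulates per-lane partial products along K with VFMACC/VFNMSAC and performs a single vfredusum per accumulator when a tile is finished, after which alpha is applied. The four #if groups (NN/NT/TN/TT, NR/NC/TR/TC, RN/RT/CN/CT, RR/RC/CR/CC) correspond to conjugating neither operand, B only, A only, or both, which only changes the signs of the imaginary cross terms. A scalar sketch of the unconjugated update for one element pair follows (names are illustrative, not taken from the kernel):

/* res += a * b for complex a = (ar, ai), b = (br, bi), no conjugation.
 * Conjugating a flips the sign of ai and conjugating b flips the sign
 * of bi, which the remaining #if groups express by swapping
 * VFMACC/VFNMSAC on the cross terms. */
static void cmacc(double *res_r, double *res_i,
                  double ar, double ai, double br, double bi)
{
    *res_r += ar * br - ai * bi;
    *res_i += ai * br + ar * bi;
}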
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEG4_FLOAT vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEG4_FLOAT vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c + + +/******************************** + ADD1 a*c + ADD2 b*c + ADD3 a*d + ADD4 b*d + *********************************/ +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, + FLOAT* C,BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T_M1 v_m1_res0, v_m1_res1; + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + C0 = C; + C1 = C0+2*ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 2; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = 
VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 4; + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, 
vres5, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[2] = res0 * alphar - res1 * alphai; + C1[3] = res1 * alphar + res0 * alphai; +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + + ptrba += temp*2*2; + ptrbb += temp*2*2; + +#endif + +#ifdef LEFT + off += 2; +#endif + + C0 = C0+4; + C1 = C1+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 4; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, 
vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + C1 = C1+2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 2; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = 
VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 2; + + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp*2; + +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + } + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + return 0; +} diff --git a/param.h b/param.h index 514b13a3ac..62b675d6ce 100644 --- a/param.h +++ b/param.h @@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(x280) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2 +#define SGEMM_DEFAULT_UNROLL_N 8// 4 // 4 // 2 + +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If VLMAX size is ever more than 1024, this should be increased also. 
*/ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8 +#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef C910V #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 From 5d0d1c555195a391fe5d029427dfbf7b942ecdf9 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 18:22:21 -0800 Subject: [PATCH 03/36] Remove redundant files --- Makefile.install | 5 - kernel/riscv64/KERNEL.x280 | 36 +- kernel/riscv64/gemm_ncopy_2_rvv.c | 92 --- kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ---- kernel/riscv64/gemm_tcopy_2_rvv.c | 108 ---- kernel/riscv64/gemm_tcopy_4_rvv.c | 236 -------- kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ------- kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ---------------- kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ----------- kernel/riscv64/trmmkernel_4x4_rvv.c | 881 ---------------------------- 10 files changed, 2 insertions(+), 2543 deletions(-) delete mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c diff --git a/Makefile.install b/Makefile.install index f1adaa2719..168d08f72f 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin -OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -39,7 +38,6 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" - @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -204,8 +202,5 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! -#Generating release tar - @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz - @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . 
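A note on the x280 blocking constants that the previous patch adds to param.h: in the OpenBLAS scheme, GEMM_DEFAULT_P, _Q and _R roughly bound the M, K and N block sizes used when packing panels of A and B, so P and Q together set the size of the packed A block reused across the N loop. A small illustrative calculation for the DGEMM values (P = 160, Q = 128, UNROLL_N = 8, 8-byte doubles; this is an estimate of panel footprints, not code from the patch):

#include <stdio.h>

/* Rough packed-panel footprints implied by the x280 DGEMM blocking. */
int main(void)
{
    const long P = 160, Q = 128, UNROLL_N = 8;
    const long elem = 8;                    /* sizeof(double) */

    long a_block = P * Q * elem;            /* packed A block: 163840 B (~160 KiB) */
    long b_strip = Q * UNROLL_N * elem;     /* one packed B strip: 8192 B (8 KiB)  */

    printf("packed A block: %ld bytes\n", a_block);
    printf("packed B strip: %ld bytes\n", b_strip);
    return 0;
}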
diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 2eb60f2b46..4d64354fb7 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -122,23 +122,7 @@ CTRMMKERNEL = ztrmmkernel_2x2_rvv.c ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c # SGEMM_UNROLL_N set in params.h -ifeq ($(SGEMM_UNROLL_N), 2) -SGEMMKERNEL = gemmkernel_2x2_rvv.c -SGEMMONCOPY = gemm_ncopy_2_rvv.c -SGEMMOTCOPY = gemm_tcopy_2_rvv.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMKERNEL = gemmkernel_4x4_rvv.c -SGEMMONCOPY = gemm_ncopy_4_rvv.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 8) +ifeq ($(SGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX SGEMMKERNEL = gemmkernel_rvv_v1x8.c SGEMMINCOPY = gemm_ncopy_rvv_v1.c @@ -162,23 +146,7 @@ SSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif # SGEMM_UNROLL_N set in params.h -ifeq ($(DGEMM_UNROLL_N), 2) -DGEMMKERNEL = gemmkernel_2x2_rvv.c -DGEMMONCOPY = gemm_ncopy_2_rvv.c -DGEMMOTCOPY = gemm_tcopy_2_rvv.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMKERNEL = gemmkernel_4x4_rvv.c -DGEMMONCOPY = gemm_ncopy_4_rvv.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 8) +ifeq ($(DGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX DGEMMKERNEL = gemmkernel_rvv_v1x8.c DGEMMINCOPY = gemm_ncopy_rvv_v1.c diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c deleted file mode 100644 index 5f55bc349e..0000000000 --- a/kernel/riscv64/gemm_ncopy_2_rvv.c +++ /dev/null @@ -1,92 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
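For context on the packing routines deleted below (the KERNEL.x280 hunk above now routes SGEMM and DGEMM through the rvv_v1 copy kernels only): an N-copy routine repacks consecutive lda-strided columns of A so they are interleaved element by element, which is what the vsseg2/vsseg4 segment stores express and what lets the micro-kernel stream the packed buffer contiguously. A scalar sketch of the two-column case (illustrative only, not the retained implementation):

/* Interleave two columns of A into the packed buffer b:
 * b = { a0[0], a1[0], a0[1], a1[1], ... } */
static void ncopy_2_sketch(long m, const double *a0, const double *a1, double *b)
{
    for (long i = 0; i < m; i++) {
        b[2 * i]     = a0[i];
        b[2 * i + 1] = a1[i];
    }
}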
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEG2_FLOAT vsseg2e32_v_f32m4 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEG2_FLOAT vsseg2e64_v_f64m4 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - FLOAT_V_T v1, v2; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - - for(j = (n >> 1); j > 0; j--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset, vl); - VSEV_FLOAT(b_offset, v1, vl); - - a_offset += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c deleted file mode 100644 index 4d4efe4c95..0000000000 --- a/kernel/riscv64/gemm_ncopy_4_rvv.c +++ /dev/null @@ -1,123 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG4_FLOAT vsseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG4_FLOAT vsseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset; - - FLOAT_V_T v1, v2, v3, v4; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - for(j = (n >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - v3 = VLEV_FLOAT(a_offset3, vl); - v4 = VLEV_FLOAT(a_offset4, vl); - - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); - - a_offset1 += vl; - a_offset2 += vl; - a_offset3 += vl; - a_offset4 += vl; - b_offset += vl*4; - } - } - - if (n & 2) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - a_offset1 = a_offset; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - - VSEV_FLOAT(b_offset, v1, vl); - - a_offset1 += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c deleted file mode 100644 index 963e1be695..0000000000 --- a/kernel/riscv64/gemm_tcopy_2_rvv.c +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset, *b_offset1, *b_offset2; - FLOAT_V_T v1a, v1b, v2a, v2b; - size_t vl; - - //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - b_offset2 = b + m * (n & ~1); - - for(i = (m >> 1); i > 0; i--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 4; - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); - VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); - - VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); - - a_offset1 += vl * 2; - a_offset2 += vl * 2; - b_offset1 += vl * m * 2; - } - - if (n & 1) { - *(b_offset2 + 0) = *(a_offset1 + 0); - *(b_offset2 + 1) = *(a_offset2 + 0); - b_offset2 += 2; - } - } - - if (m & 1) { - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); - - VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); - - a_offset += vl * 2; - b_offset += vl * m * 2; - } - - if (n & 1){ - *(b_offset2 + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c deleted file mode 100644 index ac9974b24b..0000000000 --- a/kernel/riscv64/gemm_tcopy_4_rvv.c +++ /dev/null @@ -1,236 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; - FLOAT ctemp1, ctemp2, ctemp3, ctemp4; - FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - - //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - b_offset2 = b + m * (n & ~3); - b_offset3 = b + m * (n & ~1); - - for(j = (m >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - b_offset1 = b_offset; - b_offset += 16; - - for(i = (n >> 2); i > 0; i--) { - v1 = VLEV_FLOAT(a_offset1, 4); - v2 = VLEV_FLOAT(a_offset2, 4); - v3 = VLEV_FLOAT(a_offset3, 4); - v4 = VLEV_FLOAT(a_offset4, 4); - - a_offset1 += 4; - a_offset2 += 4; - a_offset3 += 4; - a_offset4 += 4; - - VSEV_FLOAT(b_offset1, v1, 4); - VSEV_FLOAT(b_offset2+4, v2, 4); - VSEV_FLOAT(b_offset2+8, v3, 4); - VSEV_FLOAT(b_offset2+12, v4, 4); - - b_offset1 += m * 4; - } - - if (n & 2) { - v1 = VLEV_FLOAT(a_offset1, 2); - v2 = VLEV_FLOAT(a_offset2, 2); - v3 = VLEV_FLOAT(a_offset3, 2); - v4 = VLEV_FLOAT(a_offset4, 2); - - a_offset1 += 2; - a_offset2 += 2; - a_offset3 += 2; - a_offset4 += 2; - - VSEV_FLOAT(b_offset2, v1, 2); - VSEV_FLOAT(b_offset2+2, v2, 2); - VSEV_FLOAT(b_offset2+4, v3, 2); - VSEV_FLOAT(b_offset2+6, v4, 2); - - b_offset2 += 8; - } - - if (n & 1) { - v1 = VLEV_FLOAT(a_offset1, 1); - v2 = VLEV_FLOAT(a_offset2, 1); - v3 = VLEV_FLOAT(a_offset3, 1); - v4 = VLEV_FLOAT(a_offset4, 1); - - VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); - - b_offset3 += 4; - } - - } - -// TODO cleanup - - if (m & 2){ - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 8; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - ctemp5 = *(a_offset2 + 0); - ctemp6 = *(a_offset2 + 1); - ctemp7 = *(a_offset2 + 2); - ctemp8 = 
*(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - *(b_offset1 + 4) = ctemp5; - *(b_offset1 + 5) = ctemp6; - *(b_offset1 + 6) = ctemp7; - *(b_offset1 + 7) = ctemp8; - - b_offset1 += m * 4; - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - - ctemp3 = *(a_offset2 + 0); - ctemp4 = *(a_offset2 + 1); - - a_offset1 += 2; - a_offset2 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - *(b_offset2 + 2) = ctemp3; - *(b_offset2 + 3) = ctemp4; - - b_offset2 += 4; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset2 + 0); - - *(b_offset3 + 0) = ctemp1; - *(b_offset3 + 1) = ctemp2; - b_offset3 += 2; - } - } - - if (m & 1){ - a_offset1 = a_offset; - b_offset1 = b_offset; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - a_offset1 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - b_offset1 += 4 * m; - - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - a_offset1 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - *(b_offset3 + 0) = ctemp1; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c deleted file mode 100644 index ec8961ced7..0000000000 --- a/kernel/riscv64/gemmkernel_2x2_rvv.c +++ /dev/null @@ -1,214 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG2_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG2_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/2; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - 
vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c deleted file mode 100644 index aa58bcc766..0000000000 --- a/kernel/riscv64/gemmkernel_4x4_rvv.c +++ /dev/null @@ -1,508 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VLSEG4_FLOAT vlseg4e32_v_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m1 -#define VFMACCVF_FLOAT vfmacc_vf_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m1(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VLSEG4_FLOAT vlseg4e64_v_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m1 -#define VFMACCVF_FLOAT vfmacc_vf_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU - - FLOAT_V_T va0, va1, va2, va3; - FLOAT_V_T vb0, vb1, vb2, vb3; - FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; - FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/4; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - C2 = C1 + ldc; - C3 = C2 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - vres8 = VFMVVF_FLOAT(0.0, vlmax); - vres9 = VFMVVF_FLOAT(0.0, vlmax); - vres10 = VFMVVF_FLOAT(0.0, vlmax); - vres11 = VFMVVF_FLOAT(0.0, vlmax); - vres12 = VFMVVF_FLOAT(0.0, vlmax); - vres13 = VFMVVF_FLOAT(0.0, vlmax); - vres14 = VFMVVF_FLOAT(0.0, vlmax); - vres15 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); - vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); - vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); - vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); - - vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); - vres13 = 
VFMACCVV_FLOAT(vres13, va3, vb2, vl); - vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); - vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); - - ptrba += vl*4; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - C2 += 4; - C3 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - ptrba += vl*2; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - C1 += 2; - C2 += 2; - C3 += 2; - } - - if(bm & 1) { - 
ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); - vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 1; - C1 += 1; - C2 += 1; - C3 += 1; - } - - bb += (bk<<2); - C += (ldc<<2); - } - - if(bn & 2) { - - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); - vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); - vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); - - ptrba += vl*4; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - 
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - ptrba += vl*4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c deleted file mode 100644 index 127e76970a..0000000000 --- a/kernel/riscv64/trmmkernel_2x2_rvv.c +++ /dev/null @@ -1,342 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_2x2.c - - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - BLASLONG off, temp; - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 v_res, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - -#if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; -#else - off = 0; -#endif - - for (j = bn/2; j > 0; j--) - { - C0 = C; - C1 = C0+ldc; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - for (k = temp; k 
> 0; k -= vl) - { - vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); - C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 2; -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 2; -#endif - C0 = C0+2; - C1 = C1+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 2; -#endif - ptrba += temp; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - C1 = C1+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - if (bn & 1) - { - C0 = C; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off; -#endif - - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - vb0 = VLEV_FLOAT(ptrbb, vl); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); - - ptrba += vl * 2; - ptrbb += vl; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 1; -#endif - ptrba += temp*2; - ptrbb += temp; -#endif -#ifdef LEFT - off += 2; -#endif - - C0 = C0+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off + 1; -#else - temp = off + 1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - ptrba += vl; - ptrbb += vl; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 1; -#endif - ptrba += temp; - ptrbb += temp; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} - diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c deleted file mode 100644 index 3e46c6348b..0000000000 --- a/kernel/riscv64/trmmkernel_4x4_rvv.c +++ /dev/null @@ -1,881 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" -#include - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMUL_FLOAT vfmul_vv_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMUL_FLOAT vfmul_vv_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_4x4.c - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - - FLOAT_V_T vres0_0; - FLOAT_V_T vres0_1; - FLOAT_V_T vres0_2; - FLOAT_V_T vres0_3; - - FLOAT_V_T vres1_0; - FLOAT_V_T vres1_1; - FLOAT_V_T vres1_2; - FLOAT_V_T vres1_3; - - FLOAT_V_T vres2_0; - FLOAT_V_T vres2_1; - FLOAT_V_T vres2_2; - FLOAT_V_T vres2_3; - - FLOAT_V_T vres3_0; - FLOAT_V_T vres3_1; - FLOAT_V_T vres3_2; - FLOAT_V_T vres3_3; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - - for (j=0; j 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); - vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); - vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); - - ptrba += vl * 4; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = 
VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - if (!backwards) { - temp = bk-off; - temp = left ? temp - 4 : // number of values in A - temp - 4; // number of values in B - - ptrba += temp*4; // number of values in A - ptrbb += temp*4; // number of values in B - } -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - C2 = C2+4; - C3 = C3+4; - - } - - if ( bm & 2 ) // do any 2x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres2_1 = VFMVVF_FLOAT(0, vlmax); - - vres3_0 = VFMVVF_FLOAT(0, vlmax); - vres3_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+4; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - ptrba += vl * 2; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, 
vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); - - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - C2 = C2+2; - C3 = C3+2; - - } - - if ( bm & 1 ) // do any 1x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres3_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+4; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - C2 = C2+1; - C3 = C3+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; -#endif - - k = (bk<<2); - bb = bb+k; - i = (ldc<<2); - C = C+i; - } - - for (j=0; j<(bn&2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - - ptrba += vl * 4; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, 
vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) // do any 2x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+2; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) // do any 1x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*2; -#endif - - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = 
alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - - ptrba += vl * 4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - ptrba += vl * 2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl 
= VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - - } - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} From 9702d57b11351a5360a2f0326c69c3f550c784d2 Mon Sep 17 00:00:00 2001 From: HellerZheng Date: Wed, 16 Nov 2022 11:11:04 +0800 Subject: [PATCH 04/36] Update Makefile.install --- Makefile.install | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index 168d08f72f..87b5bc8701 100644 --- a/Makefile.install +++ b/Makefile.install @@ -202,5 +202,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - - From 3918d8504e7720d94221025ae6078a2459ccb104 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 21 Nov 2022 19:06:07 -0800 Subject: [PATCH 05/36] nrm2 simple optimization --- kernel/riscv64/nrm2_rvv.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 3f5d50397e..979c316481 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -39,9 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMACCVV_FLOAT vfmacc_vv_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT vfabs_v_f32m8 #define ABS fabsf #else #define VSETVL(n) vsetvl_e64m8(n) @@ -54,9 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMACCVV_FLOAT vfmacc_vv_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT vfabs_v_f64m8 #define ABS fabs #endif @@ -68,12 +64,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if(n == 1) return (ABS(x[0])); FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_max, v_res; - FLOAT scale = 0.0, ssq = 0.0; + FLOAT_V_T_M1 v_res; + FLOAT ssq = 0.0; size_t vlmax = VSETVL_MAX; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); vr = VFMVVF_FLOAT(0, vlmax); @@ -83,9 +78,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLEV_FLOAT(x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } @@ -98,20 +90,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } - } v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - return(scale * sqrt(ssq)); + return sqrt(ssq); } From 387e8970cd8ce581a6c7bc48418860966140f621 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 28 Nov 2022 21:42:29 -0800 Subject: [PATCH 06/36] Fix merge problem; Update compiling COMMON_OPT per review comments. --- Makefile.prebuild | 2 +- Makefile.riscv64 | 6 +++--- common_riscv64.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index e6a8eab597..c4f4a26026 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -56,7 +56,7 @@ TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif ifeq ($(TARGET), x280) -TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif ifeq ($(TARGET), RISCV64_GENERIC) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index d6eaf552d6..d091984a6c 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,10 +3,10 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math -FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static -endif \ No newline at end of file +endif diff --git a/common_riscv64.h b/common_riscv64.h index 221a799016..2092bd5abc 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) -#include +#include #endif #if defined(x280) From c19dff0a31c58163dc386b0c4270e75f576b97be Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 25 Jan 2023 19:33:32 +0800 Subject: [PATCH 07/36] Fix T-Head RVV intrinsic API changes. 
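The hunks below rename the early T-Head RVV load intrinsics to the element-width-qualified spellings used by the newer intrinsic API, e.g. vle_v_f32m8 becomes vle32_v_f32m8 and vlse_v_f64m8 becomes vlse64_v_f64m8; the macro definitions in each kernel are updated accordingly. For illustration only, a minimal sketch of the rename, assuming the unprefixed pre-1.0 RVV intrinsic naming these kernels target (load_f32 is a hypothetical helper, not part of the patch):

    #include <riscv_vector.h>

    /* Hypothetical helper: unit-stride single-precision load at LMUL=8. */
    static inline vfloat32m8_t load_f32(const float *x, size_t vl)
    {
        /* old spelling: vle_v_f32m8(x, vl); */
        return vle32_v_f32m8(x, vl);  /* element width is now encoded in the name */
    }

The strided variants (vlse_v_* to vlse32_v_*/vlse64_v_*) follow the same pattern, as the per-file hunks show.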
--- kernel/riscv64/amax_vector.c | 8 ++++---- kernel/riscv64/amin_vector.c | 8 ++++---- kernel/riscv64/asum_vector.c | 8 ++++---- kernel/riscv64/axpby_vector.c | 16 ++++++++-------- kernel/riscv64/axpy_vector.c | 16 ++++++++-------- kernel/riscv64/copy_vector.c | 16 ++++++++-------- kernel/riscv64/dot_vector.c | 8 ++++---- kernel/riscv64/gemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/gemv_t_vector.c | 8 ++++---- kernel/riscv64/iamax_vector.c | 8 ++++---- kernel/riscv64/iamin_vector.c | 8 ++++---- kernel/riscv64/imax_vector.c | 8 ++++---- kernel/riscv64/imin_vector.c | 8 ++++---- kernel/riscv64/izamax_vector.c | 4 ++-- kernel/riscv64/izamin_vector.c | 4 ++-- kernel/riscv64/max_vector.c | 8 ++++---- kernel/riscv64/min_vector.c | 8 ++++---- kernel/riscv64/nrm2_vector.c | 8 ++++---- kernel/riscv64/rot_vector.c | 16 ++++++++-------- kernel/riscv64/scal_vector.c | 16 ++++++++-------- kernel/riscv64/swap_vector.c | 16 ++++++++-------- kernel/riscv64/symv_L_vector.c | 16 ++++++++-------- kernel/riscv64/symv_U_vector.c | 16 ++++++++-------- kernel/riscv64/zamax_vector.c | 4 ++-- kernel/riscv64/zamin_vector.c | 4 ++-- kernel/riscv64/zasum_vector.c | 8 ++++---- kernel/riscv64/zaxpby_vector.c | 8 ++++---- kernel/riscv64/zaxpy_vector.c | 8 ++++---- kernel/riscv64/zcopy_vector.c | 8 ++++---- kernel/riscv64/zdot_vector.c | 8 ++++---- kernel/riscv64/zgemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/zgemv_t_vector.c | 4 ++-- kernel/riscv64/zhemv_LM_vector.c | 8 ++++---- kernel/riscv64/zhemv_UV_vector.c | 8 ++++---- kernel/riscv64/znrm2_vector.c | 8 ++++---- kernel/riscv64/zrot_vector.c | 16 ++++++++-------- kernel/riscv64/zscal_vector.c | 8 ++++---- kernel/riscv64/zswap_vector.c | 16 ++++++++-------- 38 files changed, 190 insertions(+), 190 deletions(-) diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b778d3e55f..1b77993400 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index fd2f83dc9d..f9b7defaea 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index a822751535..fc73362bc1 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 988c57ec23..676dfd4745 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 98b9f6814a..6f921f2d6e 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -30,18 +30,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index a46136d6cf..fee5e195da 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index 64efc6c40d..f47e0c0b5d 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -45,8 +45,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 32ca8618b2..bb9ab8e5a8 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7683641fa1..7d0b70cbbc 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index ecb4cd7a9b..9fea522f7f 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -60,8 +60,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index c72bb94cca..4e81e78484 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index c2d787ab8f..ca48a3c48e 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index dfe9a33104..2a677098d3 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index fdbdc3ae8b..66a101566f 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 59c7203106..818193a9e0 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index b988513c90..7f31e9a530 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index be0803df60..14b7e01ed1 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 2a83e2a521..cf6fdb741e 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -55,8 +55,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index 9b48d1c699..43a65e552a 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFMSACVF_FLOAT vfmsac_vf_f32m4 @@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFMSACVF_FLOAT vfmsac_vf_f64m4 diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 7a3153b7cd..8b9ef5a3e4 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #define VFMULVF_FLOAT vfmul_vf_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #define VFMULVF_FLOAT vfmul_vf_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #endif diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index d9421e2f10..b165928088 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 6588f4dda8..58ec17b03d 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,10 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 31104eae6d..34ff0e30ac 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 9dbeba90f5..bfb282ae06 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index dc58075ac4..d9eca7f102 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 8386ab62e3..0d1cc42f10 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 3eca20415c..5e6034ac58 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 303d3541e5..4ccfe4a814 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 600f02bba2..55a480a357 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #endif diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index ec38ed9d23..0900206b34 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index b5ee1f054f..3095c28f90 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index e930dc2a22..a7a8a52796 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 275ee9131b..0a284a9991 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 2f46977d44..33b7c9c25a 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index 59d0e219df..cadabdb75f 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -53,8 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 2fdd8135a5..858dfd1732 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 64323aa3a1..d275b75f81 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index 7550294b5a..c1dcaccab6 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) From 63cf4d01668f8f6c73a05039bc36785ba78b0940 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Wed, 1 Feb 2023 19:13:44 -0800 Subject: [PATCH 08/36] add riscv level3 C,Z kernel functions. --- kernel/riscv64/KERNEL.x280 | 85 +++- kernel/riscv64/trmm_lncopy_rvv_v1.c | 8 +- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 644 +++--------------------- kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 658 +++---------------------- kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 610 +++-------------------- kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 623 +++-------------------- kernel/riscv64/zgemm_ncopy_4_rvv.c | 121 +++++ kernel/riscv64/zgemm_ncopy_rvv_v1.c | 74 +++ kernel/riscv64/zgemm_tcopy_4_rvv.c | 181 +++++++ kernel/riscv64/zgemm_tcopy_rvv_v1.c | 74 +++ kernel/riscv64/zgemmkernel_rvv_v1x4.c | 475 ++++++++++++++++++ kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 124 +++++ kernel/riscv64/zhemm_utcopy_rvv_v1.c | 120 +++++ kernel/riscv64/zsymm_lcopy_rvv_v1.c | 106 ++++ kernel/riscv64/zsymm_ucopy_rvv_v1.c | 106 ++++ kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 145 ++++++ kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 143 ++++++ kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 144 ++++++ kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 140 ++++++ kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 574 +++++++++++++++++++++ kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 115 +++++ kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 114 +++++ kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 113 +++++ kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 115 +++++ param.h | 10 +- 25 files changed, 3342 insertions(+), 2280 deletions(-) create mode 100644 kernel/riscv64/zgemm_ncopy_4_rvv.c create mode 100644 kernel/riscv64/zgemm_ncopy_rvv_v1.c create mode 100644 kernel/riscv64/zgemm_tcopy_4_rvv.c create mode 100644 kernel/riscv64/zgemm_tcopy_rvv_v1.c create mode 100644 kernel/riscv64/zgemmkernel_rvv_v1x4.c create mode 100644 kernel/riscv64/zhemm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/zhemm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/zsymm_lcopy_rvv_v1.c create mode 100644 kernel/riscv64/zsymm_ucopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmmkernel_rvv_v1x4.c create mode 100644 kernel/riscv64/ztrsm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_utcopy_rvv_v1.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 4d64354fb7..217d8534e5 
100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -118,8 +118,8 @@ DGEMVTKERNEL = gemv_t_rvv.c CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c -CTRMMKERNEL = ztrmmkernel_2x2_rvv.c -ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c +CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c +ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c # SGEMM_UNROLL_N set in params.h ifeq ($(SGEMM_UNROLL_N), 8) @@ -168,17 +168,28 @@ DSYMMUCOPY_M = symm_ucopy_rvv_v1.c DSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMKERNEL = zgemmkernel_rvv_v1x4.c +CGEMMINCOPY = zgemm_ncopy_rvv_v1.c +CGEMMITCOPY = zgemm_tcopy_rvv_v1.c +CGEMMONCOPY = zgemm_ncopy_4_rvv.c +CGEMMOTCOPY = zgemm_tcopy_4_rvv.c -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c + +ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c +ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c +ZGEMMONCOPY = zgemm_ncopy_4_rvv.c +ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c @@ -190,20 +201,25 @@ DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c -TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c -TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c -TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c +ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c +ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c SSYMV_U_KERNEL = symv_U_rvv.c SSYMV_L_KERNEL = symv_L_rvv.c @@ -214,6 +230,27 @@ CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c +ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + 
+ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c LSAME_KERNEL = ../generic/lsame.c diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c index 73a8233f8d..3457ca3e1b 100644 --- a/kernel/riscv64/trmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT vse32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 #define VBOOL_T vbool16_t -#define UINT_V_T vint32m2_t -#define VID_V_UINT vid_v_i32m2 -#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 #define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 11a0398ca1..2cba06b386 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m2(n) #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 #define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -88,606 +91,107 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - FLOAT *pa, *pc; + FLOAT aa; + FLOAT* pc; int i, j, k; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug - size_t vl; - FLOAT_V_T va, vc; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - for (j = 0; j < n; j ++) - { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - pa = a; - pc = c + j * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLEV_FLOAT(pc, vl); - va = VLEV_FLOAT(pa, vl); - vc = VFNMSACVF_FLOAT(vc, bb, va, vl); - 
VSEV_FLOAT(pc, vc, vl); - pa += vl; - pc += vl; - } - } - a -= m; - b -= 2 * n; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - FLOAT aa, bb0, bb1; - FLOAT *pa, *pc, *pc0, *pc1; - FLOAT *pb0, *pb1; - - int i, j, k; - fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + FLOAT_V_T vb, vc; size_t vl; - FLOAT_V_T va, vc0, vc1; a += (m - 1) * m; b += (m - 1) * n; - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/2; j ++) - { - //bb = *(c + i + j * ldc); - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - //bb *= aa; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - //*b = bb; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - - //*(c + i + j * ldc) = bb; - //b ++; - - b += 2; - //pa = a + i + 1; - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - pa = a; - //pc = c + j * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - } - pc += ldc * (n/2) * 2; - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - - a -= m; - b -= 2 * n; - } - -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb0, bb1, bb2, bb3; - FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3; - FLOAT *pb0, *pb1, *pb2, *pb3; - - int i, j, k; - - size_t vl; - FLOAT_V_T va, vc0, vc1, vc2, vc3; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/4; j ++) - { - pb0 = pc + j * ldc * 4; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - - b += 4; - - pc0 = c + j * ldc * 4; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - } - } - pc += ldc * (n/4) * 4; - - if (n & 2) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - - *b = bb0; - *(b+1) = bb1; - - *pb0 = bb0; - *pb1 = bb1; - - b += 2; - - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, 
bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - - a -= m; - b -= 2 * n; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; - FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; - - int i, j, k; - - size_t vl; - FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - a += (m - 1) * m; - b += (m - 1) * n; + for (i = m - 1; i >= 0; i--) { - for (i = m - 1; i >= 0; i--) - { aa = *(a + i); - pc = c + i; - for (j = 0; j < n/8; j ++) - { - pb0 = pc + j * ldc * 8; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - pb4 = pb3 + ldc; - pb5 = pb4 + ldc; - pb6 = pb5 + ldc; - pb7 = pb6 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - bb4 = (*pb4) * aa; - bb5 = (*pb5) * aa; - bb6 = (*pb6) * aa; - bb7 = (*pb7) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - *(b+4) = bb4; - *(b+5) = bb5; - *(b+6) = bb6; - *(b+7) = bb7; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - *pb4 = bb4; - *pb5 = bb5; - *pb6 = bb6; - *pb7 = bb7; - - b += 8; - - pc0 = c + j * ldc * 8; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - pc4 = pc3 + ldc; - pc5 = pc4 + ldc; - pc6 = pc5 + ldc; - pc7 = pc6 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - vc4 = VLEV_FLOAT(pc4, vl); - vc5 = VLEV_FLOAT(pc5, vl); - vc6 = VLEV_FLOAT(pc6, vl); - vc7 = VLEV_FLOAT(pc7, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); - vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); - vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); - vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - VSEV_FLOAT(pc4, vc4, vl); - VSEV_FLOAT(pc5, vc5, vl); - VSEV_FLOAT(pc6, vc6, vl); - VSEV_FLOAT(pc7, vc7, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - pc4 += vl; - pc5 += vl; - pc6 += vl; - pc7 += vl; - } - } - pc += ldc * (n/8) * 8; - - if (n & 4) - { - pb0 = pc + j * ldc * 4; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - - b += 4; - - pc0 = c + j * ldc * 4; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 
= VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = 0; k < i; k ++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); } - pc += ldc * 4; + pc += vl * ldc; } - - if (n & 2) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - - *b = bb0; - *(b+1) = bb1; - - *pb0 = bb0; - *pb1 = bb1; - - b += 2; - - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - a -= m; b -= 2 * n; } } -#else -static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - - int i, j, k; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) { - - aa = *(a + i); - - for (j = 0; j < n; j ++) { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - for (k = 0; k < i; k ++){ - *(c + k + j * ldc) -= bb * *(a + k); - } - - } - a -= m; - b -= 2 * n; - } - -} - -#endif - #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; - ldc *= 2; - a += (m - 1) * m * 2; - b += (m - 1) * n * 2; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; - for (i = m - 1; i >= 0; i--) { + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); + for (i = m - 1; i >= 0; i--) { - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * ldc); + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif + VSSEG2_FLOAT(b, vs1, vs2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + b += vl * 2; - - *(b + 0) = cc1; 
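The strided-vector LN solve above performs the same back-substitution as the generic scalar kernel it replaces, only across vl columns of C at a time: VLSEV/VSSEV gather and scatter one element per column with a byte stride of sizeof(FLOAT)*ldc. As a reference for the arithmetic, here is a compact scalar restatement; solve_ln_ref is an illustrative name only, and float/long stand in for FLOAT/BLASLONG:

/* Row i of C is scaled by the packed diagonal entry (stored pre-inverted
 * by the trsm copy routines), copied into the packed B panel, and then
 * eliminated from the rows k < i. */
static void solve_ln_ref(long m, long n, const float *a, float *b,
                         float *c, long ldc)
{
    a += (m - 1) * m;                  /* last packed row of the A panel */
    b += (m - 1) * n;
    for (long i = m - 1; i >= 0; i--) {
        float aa = a[i];               /* reciprocal of the diagonal */
        for (long j = 0; j < n; j++) {
            float bb = c[i + j * ldc] * aa;
            *b++ = bb;                 /* packed copy consumed by later GEMM calls */
            c[i + j * ldc] = bb;
            for (long k = 0; k < i; k++)
                c[k + j * ldc] -= bb * a[k];
        }
        a -= m;
        b -= 2 * n;                    /* net step of -n per row, same bookkeeping as above */
    }
}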
- *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - for (k = 0; k < i; k ++){ + for (k = 0; k < i; k ++) { + VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); #ifndef CONJ - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#else - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - } - + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + } + pc += vl * ldc * 2; + } + a -= m * 2; + b -= 4 * n; } - a -= m * 2; - b -= 4 * n; - } - } + #endif diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index 0380bd1bbe..492a5631fa 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m2(n) #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 #define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -87,468 +90,39 @@ static FLOAT dm1 = -1.; // Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ - FLOAT aa, bb; - FLOAT *pa, *pc; - int i, j, k; - size_t vl; - FLOAT_V_T va, vc; - for (i = 0; i < m; i++) - { - aa = *(a + i); - for (j = 0; j < n; j ++) - { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b++; - pa = a + i + 1; - pc = c + j * ldc + i + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLEV_FLOAT(pc, vl); - va = VLEV_FLOAT(pa, vl); - vc = VFNMSACVF_FLOAT(vc, bb, va, vl); - VSEV_FLOAT(pc, vc, vl); - 
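In the complex LN path above, vs1/vs2 hold the real and imaginary parts of the scaled value for vl columns at once, and the #ifndef CONJ branches only flip the signs of the cross terms. The scalar equivalent of that diagonal scaling, matching the removed code, is sketched below (zscale_ref is an illustrative name, not part of the patch):

/* Complex scaling by the packed, pre-inverted diagonal aa = aa1 + i*aa2;
 * bb = bb1 + i*bb2 is the current C entry.  conj != 0 selects the
 * conjugated variant used by the CONJ build. */
static void zscale_ref(float aa1, float aa2, float bb1, float bb2,
                       float *cc1, float *cc2, int conj)
{
    if (!conj) {
        *cc1 = aa1 * bb1 - aa2 * bb2;
        *cc2 = aa1 * bb2 + aa2 * bb1;
    } else {
        *cc1 = aa1 * bb1 + aa2 * bb2;
        *cc2 = aa1 * bb2 - aa2 * bb1;
    }
}

The VFMULVF/VFMACCVF/VFNMSACVF sequence above computes these same four products per element, just vl lanes wide.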
pa += vl; - pc += vl; - } - } - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb0, bb1; - FLOAT *pa, *pc, *pc0, *pc1; - FLOAT *pb0, *pb1; + FLOAT aa; + FLOAT* pc; int i, j, k; - size_t vl; - FLOAT_V_T va, vc0, vc1; - for (i = 0; i < m; i++) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/2; j ++) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - } - pc += ldc * (n/2) * 2; - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) = bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 4 -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - FLOAT aa, bb0, bb1, bb2, bb3; - FLOAT *pa, *pc; - FLOAT *pc0, *pc1, *pc2, *pc3; - FLOAT *pb0, *pb1, *pb2, *pb3; + FLOAT_V_T vb, vc; - int i, j, k; size_t vl; - FLOAT_V_T va; - FLOAT_V_T vc0, vc1, vc2, vc3; - for (i = 0; i < m; i++) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/4; j ++) - { - pb0 = pc; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - b += 4; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - } - } - pc += ldc * (n/4) * 4; - - if (n & 2) - { - pb0 = pc; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) 
= bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ - - FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; - FLOAT *pa, *pc; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + for (i = 0; i < m; i++) { - int i, j, k; - size_t vl; - FLOAT_V_T va; - FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - for (i = 0; i < m; i++) - { aa = *(a + i); - pc = c + i; - for (j = 0; j < n/8; j ++) - { - pb0 = pc + j * ldc * 8; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - pb4 = pb3 + ldc; - pb5 = pb4 + ldc; - pb6 = pb5 + ldc; - pb7 = pb6 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - bb4 = (*pb4) * aa; - bb5 = (*pb5) * aa; - bb6 = (*pb6) * aa; - bb7 = (*pb7) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - *(b+4) = bb4; - *(b+5) = bb5; - *(b+6) = bb6; - *(b+7) = bb7; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - *pb4 = bb4; - *pb5 = bb5; - *pb6 = bb6; - *pb7 = bb7; - b += 8; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - pc4 = pc3 + ldc; - pc5 = pc4 + ldc; - pc6 = pc5 + ldc; - pc7 = pc6 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - vc4 = VLEV_FLOAT(pc4, vl); - vc5 = VLEV_FLOAT(pc5, vl); - vc6 = VLEV_FLOAT(pc6, vl); - vc7 = VLEV_FLOAT(pc7, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); - vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); - vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); - vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - VSEV_FLOAT(pc4, vc4, vl); - VSEV_FLOAT(pc5, vc5, vl); - VSEV_FLOAT(pc6, vc6, vl); - VSEV_FLOAT(pc7, vc7, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - pc4 += vl; - pc5 += vl; - pc6 += vl; - pc7 += vl; - } - } - pc += ldc * (n/8) * 8; - - if (n & 4) - { - pb0 = pc; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - b += 4; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; 
- pc2 += vl; - pc3 += vl; + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = i + 1; k < m; k++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); } - pc += ldc * 4; + pc += vl * ldc; } - - if (n & 2) - { - pb0 = pc; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) = bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - a += m; } } @@ -557,146 +131,60 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - - int i, j, k; - - for (i = 0; i < m; i++) { - - aa = *(a + i); - - for (j = 0; j < n; j ++) { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - for (k = i + 1; k < m; k ++){ - *(c + k + j * ldc) -= bb * *(a + k); - } - - } - a += m; - } -} - -#endif - -#else - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; - - ldc *= 2; - - for (i = 0; i < m; i++) { - - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); - - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * ldc); - -#ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; -#else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; -#endif - - *(b + 0) = cc1; - *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - for (k = i + 1; k < m; k ++){ -#ifndef CONJ - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#else - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#endif - } - - } - a += m * 2; - } -} - - -static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - FLOAT *pa, *pc; - - int i, j, k; - - size_t vl; - FLOAT_V_T va0, va1, vc0, vc1; + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; - ldc *= 2; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; - for (i = 0; i < m; i++) { + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); + ldc *= 2; - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * 
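The new LT solve above runs the forward substitution with the same column-wise vectorization as the LN kernel: stride_ldc is a byte stride (hence the sizeof(FLOAT) factor), so VLSEV_FLOAT picks up C[i] from vl consecutive columns in one pass. A scalar restatement of what each pass computes (solve_lt_ref is only an illustrative name):

static void solve_lt_ref(long m, long n, const float *a, float *b,
                         float *c, long ldc)
{
    for (long i = 0; i < m; i++) {
        float aa = a[i];                      /* pre-inverted diagonal entry */
        for (long j = 0; j < n; j++) {
            float bb = c[i + j * ldc] * aa;
            *b++ = bb;                        /* packed B panel */
            c[i + j * ldc] = bb;
            for (long k = i + 1; k < m; k++)  /* eliminate from the rows below */
                c[k + j * ldc] -= bb * a[k];
        }
        a += m;
    }
}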
ldc); + for (i = 0; i < m; i++) { + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif + VSSEG2_FLOAT(b, vs1, vs2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + b += vl * 2; - *(b + 0) = cc1; - *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - pa = a + (i + 1) * 2; - pc = c + j * ldc + (i + 1) * 2; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, pa, vl); - VLSEG2_FLOAT(&vc0, &vc1, pc, vl); + for (k = i + 1; k < m; k++) { + VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); #ifndef CONJ - vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); - vc0 = VFMACCVF_FLOAT(vc0, cc2, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); -#else - vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); - vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1); - vc1 = VFMACCVF_FLOAT(vc1, cc1, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pc, vc0, vc1, vl); - pa += vl * 2; - pc += vl * 2; + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + } + pc += vl * ldc * 2; } - } + + a += m * 2; } - a += m * 2; - } } #endif @@ -714,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug j = (n >> GEMM_UNROLL_N_SHIFT); diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 41368be600..4751ae012f 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -32,28 +32,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t #define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t #define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif - static FLOAT dm1 = -1.; #ifdef CONJ @@ -86,569 +90,99 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - FLOAT *pb, *pc; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - - for (j = 0; j < m; j ++) - { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - pb = b + i + 1; - pc = c + j + (i + 1) *ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLSEV_FLOAT(pc, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); - VSSEV_FLOAT(pc, stride_ldc, vc, vl); - pb += vl; - pc ++; - } - } - b += n; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT bb; + FLOAT *pci, *pcj; - FLOAT aa0, aa1, bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pc0, *pc1; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/2; j ++) - { - pa0 = pc + j * 2; - pa1 = pc + j * 2 + 1; - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - pb += vl; - pc0++; - pc1++; - } - } - pc += (m/2)*2; - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b += n; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT 
*a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT_V_T va, vc; - FLOAT bb; - FLOAT aa0, aa1, aa2, aa3; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3; - FLOAT *pc0, *pc1, *pc2, *pc3; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3; + for (i = 0; i < n; i++) { - for (i = 0; i < n; i++) - { bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/4; j ++) - { - pa0 = pc + j * 4; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - - a += 4; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - } - pc += (m/4)*4; - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = i + 1; k < n; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); } + pcj += vl; } b += n; } } -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT bb; - FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/8; j ++) - { - pa0 = pc + j * 8; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - pa4 = pa3 + 1; - pa5 = pa4 + 1; - pa6 = pa5 + 1; - pa7 = pa6 + 1; - - aa0 = 
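The RN solve above works from the B side: column i of C is scaled by the pre-inverted diagonal of B, written into the packed A buffer, and then subtracted (times b[k]) from the later columns k > i. Because a column of C is contiguous, unit-stride VLEV/VSEV accesses suffice here, with vl rows handled per pass. A scalar reference of the same arithmetic (illustrative name, float/long for brevity):

static void solve_rn_ref(long m, long n, float *a, const float *b,
                         float *c, long ldc)
{
    for (long i = 0; i < n; i++) {
        float bb = b[i];                      /* reciprocal of the diagonal of B */
        for (long j = 0; j < m; j++) {
            float aa = c[j + i * ldc] * bb;
            *a++ = aa;                        /* packed copy of the solved column */
            c[j + i * ldc] = aa;
            for (long k = i + 1; k < n; k++)
                c[j + k * ldc] -= aa * b[k];
        }
        b += n;
    }
}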
*pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - aa4 = *pa4 * bb; - aa5 = *pa5 * bb; - aa6 = *pa6 * bb; - aa7 = *pa7 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - *pa4 = aa4; - *pa5 = aa5; - *pa6 = aa6; - *pa7 = aa7; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - *(a + 4)= aa4; - *(a + 5)= aa5; - *(a + 6)= aa6; - *(a + 7)= aa7; - - a += 8; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - pc4 = pa4 + ldc; - pc5 = pa5 + ldc; - pc6 = pa6 + ldc; - pc7 = pa7 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); - vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); - vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); - vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); - vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); - vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); - vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); - VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); - VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); - VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - pc4++; - pc5++; - pc6++; - pc7++; - } - } - pc += (m/8)*8; - - if (m & 4) - { - pa0 = pc; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - - a += 4; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - pc += 4; - } - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb 
= b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b += n; - } -} #else -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - int i, j, k; - - for (i = 0; i < n; i++) { - - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - for (k = i + 1; k < n; k ++){ - *(c + j + k * ldc) -= aa * *(b + k); - } - - } - b += n; - } -} - -#endif +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { -#else + FLOAT bb1, bb2; -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *pci, *pcj; - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; + int i, j, k; - int i, j, k; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; - ldc *= 2; + size_t vl; - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { - bb1 = *(b + i * 2 + 0); - bb2 = *(b + i * 2 + 1); + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); - for (j = 0; j < m; j ++) { - aa1 = *(c + j * 2 + 0 + i * ldc); - aa2 = *(c + j * 2 + 1 + i * ldc); + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + VLSEG2_FLOAT(&va1, &va2, pci, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = -aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif + VSSEG2_FLOAT(a, vs1, vs2, vl); + VSSEG2_FLOAT(pci, vs1, vs2, vl); + a += vl * 2; + pci += vl * 2; - *(a + 0) = cc1; - *(a + 1) = cc2; - *(c + j * 2 + 0 + i * ldc) = cc1; - *(c + j * 2 + 1 + i * ldc) = cc2; - a += 2; - - for (k = i + 1; k < n; k ++){ + for (k = i + 1; k < n; k ++){ + VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); #ifndef CONJ - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #else - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - } - + VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + } + pcj += vl * 2; + } + b += n * 2; } - b += n * 2; - } } #endif @@ -666,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, 
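In the complex RN update loop above, the four fused ops per k accumulate a complex multiply-subtract into (vc1, vc2). Written out for a single element in the non-CONJ case, with s the freshly solved value held in (vs1, vs2) and bk the k-th entry of the packed B column, this is the following scalar update (a hypothetical helper, shown only to make the sign pattern explicit):

/* c -= s * bk, one complex element; the vector code applies this to vl
 * rows at a time and flips the cross-term signs under CONJ. */
static void zupdate_ref(float *c_re, float *c_im,
                        float s_re, float s_im,
                        float bk_re, float bk_im)
{
    *c_re -= s_re * bk_re - s_im * bk_im;
    *c_im -= s_re * bk_im + s_im * bk_re;
}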
offset); // Debug j = (n >> GEMM_UNROLL_N_SHIFT); diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index 459c1663ac..93a9e69169 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -32,25 +32,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t #define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSEV_FLOAT vse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t #define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSEV_FLOAT vse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -86,497 +85,38 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - FLOAT *pb, *pc; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc; - - a += (n - 1) * m; - b += (n - 1) * n; - - for (i = n - 1; i >= 0; i--) { - - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - pb = b; - pc = c + j; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLSEV_FLOAT(pc, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); - VSSEV_FLOAT(pc, stride_ldc, vc, vl); - pb += vl; - pc++; - } - } - b -= n; - a -= 2 * m; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT bb; + FLOAT *pci, *pcj; - FLOAT aa0, aa1, bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pc0, *pc1; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1; - - a += (n - 1) * m; - b += (n - 1) * n; + FLOAT_V_T va, vc; - for (i = n - 1; i >= 0; i--) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/2; j ++) - { - pa0 = pc + j * 2; - pa1 = pc + j * 2 + 1; - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = c + j * 2; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - pb += vl; - pc0++; - pc1++; - } - } - pc += (m/2)*2; - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, 
vl); - pb += vl; - pc0++; - } - } - b -= n; - a -= 2 * m; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa0, aa1, aa2, aa3; - FLOAT bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3; - FLOAT *pc0, *pc1, *pc2, *pc3; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3; a += (n - 1) * m; b += (n - 1) * n; - for (i = n - 1; i >= 0; i--) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/4; j ++) - { - pa0 = pc + j * 4; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - a += 4; - - pb = b; - pc0 = c + j * 4; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - } - pc += (m/4)*4; - - if (m & 2) - { - pa0 = pc + j * 2; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = c + j * 4; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b -= n; - a -= 2 * m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; - FLOAT bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - a += (n - 1) * m; - b += (n - 1) * n; + for (i = n - 1; i >= 0; i--) { - for (i = n - 1; i >= 0; i--) - { bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/8; j ++) - { - pa0 = pc + j * 8; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - pa4 = pa3 + 1; - pa5 = pa4 + 1; - pa6 = pa5 + 1; - pa7 = pa6 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - aa4 = *pa4 * bb; - aa5 = *pa5 * bb; - aa6 = *pa6 * bb; - aa7 = *pa7 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - 
*pa3 = aa3; - *pa4 = aa4; - *pa5 = aa5; - *pa6 = aa6; - *pa7 = aa7; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - *(a + 4)= aa4; - *(a + 5)= aa5; - *(a + 6)= aa6; - *(a + 7)= aa7; - a += 8; - - pb = b; - pc0 = c + j * 8; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - pc4 = pc3 + 1; - pc5 = pc4 + 1; - pc6 = pc5 + 1; - pc7 = pc6 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); - vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); - vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); - vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); - vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); - vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); - vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); - VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); - VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); - VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - pc4++; - pc5++; - pc6++; - pc7++; - } - } - pc += (m/8)*8; - - if (m & 4) - { - pa0 = pc; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - a += 4; - - pb = b; - pc0 = pc - i * ldc; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - pc += 4; - } - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = pc - i * ldc; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; + pci = c + i 
* ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = 0; k < i; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); } + pcj += vl; } b -= n; a -= 2 * m; @@ -587,92 +127,65 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - - int i, j, k; - - a += (n - 1) * m; - b += (n - 1) * n; + FLOAT bb1, bb2; - for (i = n - 1; i >= 0; i--) { + FLOAT *pci, *pcj; - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - for (k = 0; k < i; k ++){ - *(c + j + k * ldc) -= aa * *(b + k); - } - - } - b -= n; - a -= 2 * m; - } - -} - -#endif - -#else - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; - - ldc *= 2; + int i, j, k; - a += (n - 1) * m * 2; - b += (n - 1) * n * 2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; - for (i = n - 1; i >= 0; i--) { + size_t vl; - bb1 = *(b + i * 2 + 0); - bb2 = *(b + i * 2 + 1); + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; - for (j = 0; j < m; j ++) { + for (i = n - 1; i >= 0; i--) { - aa1 = *(c + j * 2 + 0 + i * ldc); - aa2 = *(c + j * 2 + 1 + i * ldc); + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + VLSEG2_FLOAT(&va1, &va2, pci, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = - aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif + VSSEG2_FLOAT(a, vs1, vs2, vl); + VSSEG2_FLOAT(pci, vs1, vs2, vl); + a += vl * 2; + pci += vl * 2; - *(a + 0) = cc1; - *(a + 1) = cc2; - - *(c + j * 2 + 0 + i * ldc) = cc1; - *(c + j * 2 + 1 + i * ldc) = cc2; - a += 2; - - for (k = 0; k < i; k ++){ + for (k = 0; k < i; k ++){ + VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); #ifndef CONJ - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #else - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - } - + VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + } + pcj += vl * 2; + } + b -= n * 2; + a -= 4 * m; } - b -= n * 2; - a -= 4 * m; - } - } #endif @@ -689,7 +202,7 @@ int CNAME(BLASLONG m, 
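The RT solve above is the mirror image of RN: it starts from the last column of B and eliminates each solved column from the earlier columns k < i, walking a and b backwards with the same pointer bookkeeping as the scalar kernel it replaces. For reference (solve_rt_ref is an illustrative name):

static void solve_rt_ref(long m, long n, float *a, const float *b,
                         float *c, long ldc)
{
    a += (n - 1) * m;
    b += (n - 1) * n;
    for (long i = n - 1; i >= 0; i--) {
        float bb = b[i];                      /* pre-inverted diagonal entry */
        for (long j = 0; j < m; j++) {
            float aa = c[j + i * ldc] * bb;
            *a++ = aa;
            c[j + i * ldc] = aa;
            for (long k = 0; k < i; k++)      /* update the earlier columns */
                c[j + k * ldc] -= aa * b[k];
        }
        b -= n;
        a -= 2 * m;                           /* net step of -m per column */
    }
}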
BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug kk = n - offset; c += n * ldc * COMPSIZE; diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c new file mode 100644 index 0000000000..389ee5d57c --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +// Optimizes the implementation in ../generic/zgemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + + FLOAT *boffset; + + FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42; + size_t vl; + + aoffset = a; + boffset = b; + lda *= 2; + + for (j = (n >> 2); j > 0; j--) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + VLSEG2_FLOAT(&v31, &v32, aoffset3, vl); + VLSEG2_FLOAT(&v41, &v42, aoffset4, vl); + + VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + aoffset3 += vl * 2; + aoffset4 += vl * 2; + boffset += vl * 8; + } + } + + if (n & 2) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + + VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + boffset += vl * 4; + } + } + + if (n & 1) { + aoffset1 = aoffset; + aoffset += lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + + VSSEG2_FLOAT(boffset, v11, v12, vl); + + aoffset1 += vl * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c new file mode 100644 index 0000000000..df039bab60 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
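zgemm_ncopy_4_rvv.c above packs four complex columns per pass: VLSEG2_FLOAT de-interleaves each column into a real and an imaginary vector, and VSSEG8_FLOAT stores the eight fields back interleaved, so every source row contributes eight consecutive values to b. A scalar picture of that layout (zpack_ncopy4_ref is a hypothetical helper, not in the patch):

static void zpack_ncopy4_ref(long m, const float *a1, const float *a2,
                             const float *a3, const float *a4, float *b)
{
    for (long i = 0; i < m; i++) {            /* row i of all four columns */
        *b++ = a1[2 * i]; *b++ = a1[2 * i + 1];
        *b++ = a2[2 * i]; *b++ = a2[2 * i + 1];
        *b++ = a3[2 * i]; *b++ = a3[2 * i + 1];
        *b++ = a4[2 * i]; *b++ = a4[2 * i + 1];
    }
}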
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0, v1; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda * 2; + + for(i = m; i > 0; i--) { + VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, v0, v1, vl); + + a_offset1 += 2; + b_offset += vl * 2; + } + } + return 0; +} + diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c new file mode 100644 index 0000000000..1b34039c8f --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
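The vector-length-agnostic variant zgemm_ncopy_rvv_v1.c above takes vl whole columns per outer pass (vl chosen by VSETVL from the remaining n) and walks down the rows, gathering one complex element per column with a strided segment load, so the packed panel stores, row by row, the vl complex values of that row. Scalar equivalent (hypothetical helper name, float/long for brevity):

static void zpack_ncopy_v1_ref(long m, long vl, const float *a, long lda,
                               float *b)
{
    for (long i = 0; i < m; i++)              /* one source row per step */
        for (long j = 0; j < vl; j++) {       /* vl columns of that row  */
            *b++ = a[2 * (i + j * lda) + 0];  /* real part               */
            *b++ = a[2 * (i + j * lda) + 1];  /* imaginary part          */
        }
}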
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + size_t vl; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 2); i > 0; i--) { + vl = 4; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 4; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 1) { + vl = 4; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 8; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 2); i > 0; i--) { + vl = 2; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 2; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 1) { + vl = 2; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + //aoffset1 += 2; + boffset3 += 4; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 2); i > 0; i--) { + vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 2) { + vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 1) { + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c new file mode 100644 index 0000000000..7622fb8104 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright 
(c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0, v1; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl * 2; + + for(i = m; i > 0; i--) { + VLSEG2_FLOAT(&v0, &v1, aoffset1, vl); + VSSEG2_FLOAT(boffset, v0, v1, vl); + + aoffset1 += lda * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c new file mode 100644 index 0000000000..50e29222f0 --- /dev/null +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -0,0 +1,475 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
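zgemm_tcopy_rvv_v1.c above is the transposed counterpart: each inner step loads vl memory-consecutive complex elements with a unit-stride VLSEG2, stores them contiguously into b, and then jumps a full leading dimension. The scalar sketch below restates that ordering; `width` again stands in for vl, double is chosen for concreteness, and the helper name is illustrative only.

#include <stddef.h>

/* Scalar model of the vl-wide TCOPY panels: copy width complex elements,
   then step by the leading dimension, m times per panel. */
static void zgemm_tcopy_vpanel_ref(size_t m, size_t n, size_t width,
                                   const double *a, size_t lda, double *b)
{
    for (size_t j = 0; j < n; j += width) {
        size_t w = (n - j < width) ? (n - j) : width;
        const double *src = a + 2 * j;          /* start of this panel          */
        for (size_t i = 0; i < m; i++) {
            for (size_t k = 0; k < w; k++) {    /* w memory-consecutive complex */
                *b++ = src[2 * k + 0];
                *b++ = src[2 * k + 1];
            }
            src += 2 * lda;                     /* next leading-dimension step  */
        }
    }
}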
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, 
*(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, 
*(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + VLSEG2_FLOAT(&va2, &va3, C1, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + VLSEG2_FLOAT(&va0, &va1, C2, vl); + VLSEG2_FLOAT(&va2, &va3, C3, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + VSSEG2_FLOAT(C2, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + VSSEG2_FLOAT(C3, va2, va3, vl); + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, 
vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + VLSEG2_FLOAT(&va2, &va3, C1, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + C0 += vl * 2; + C1 += vl * 2; + } + + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + C0 += vl * 2; + } + + bb += bk << 1; + C += ldc << 1; + } + return 0; +} + diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c new file mode 100644 index 0000000000..cf466d3fa8 --- /dev/null +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
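The four OP_* macros in zgemmkernel_rvv_v1x4.c above encode the complex multiply-accumulate, with the conjugation pattern selected by the NN/NR/RN/RR-family defines, and the closing VFMACC/VFNMSAC pairs apply alpha to the accumulated result. Below is a scalar sketch of the non-conjugated (NN-family) selection, using double for concreteness; the other macro selections only flip the signs of the OP_ii/OP_ir/OP_ri terms. The helper names are illustrative, not code from the patch.

/* One accumulator update: acc += a * b for the NN-family macro selection. */
static void zgemm_acc_ref(double ar, double ai,    /* packed A element re/im */
                          double br, double bi,    /* packed B element re/im */
                          double *accr, double *acci)
{
    *accr += br * ar;   /* OP_rr: vfmacc  */
    *acci += br * ai;   /* OP_ir: vfmacc  */
    *accr -= bi * ai;   /* OP_ii: vfnmsac */
    *acci += bi * ar;   /* OP_ri: vfmacc  */
}

/* Final store: C += alpha * acc, matching the VFMACC/VFNMSAC pairs. */
static void zgemm_store_ref(double alphar, double alphai,
                            double accr, double acci,
                            double *cr, double *ci)
{
    *cr += alphar * accr - alphai * acci;
    *ci += alphar * acci + alphai * accr;
}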
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 
0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + posX * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); + VLSEG2_FLOAT(&va10, &va11, ao1, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c new file mode 100644 index 0000000000..6209f54172 --- /dev/null +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -0,0 +1,120 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
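The masked merges in zhemm_ltcopy_rvv_v1.c above implement the Hermitian expansion rule lane by lane: the VMSGT/VMSLT/VMSEQ masks decide, per element, whether it lies in the stored triangle, in the mirrored triangle (imaginary part negated via VFRSUB), or on the diagonal (imaginary part forced to zero, since a Hermitian diagonal is real). A square-block scalar sketch of that rule follows; it assumes the lower triangle is the stored side and a column-major n-by-n output block, and the helper name is illustrative only.

#include <stddef.h>

/* Expand a stored triangle into a full Hermitian block (illustrative). */
static void zhemm_expand_ref(size_t n, const double *a, size_t lda, double *b)
{
    for (size_t j = 0; j < n; j++)
        for (size_t i = 0; i < n; i++) {
            size_t si = i, sj = j;
            int mirrored = (i < j);             /* assume lower triangle stored */
            if (mirrored) { si = j; sj = i; }
            double re = a[2 * (si + sj * lda) + 0];
            double im = a[2 * (si + sj * lda) + 1];
            if (mirrored) im = -im;             /* conjugate the mirrored copy  */
            if (i == j)   im = 0.0;             /* Hermitian diagonal is real   */
            b[2 * (i + j * n) + 0] = re;
            b[2 * (i + j * n) + 1] = im;
        }
}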
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + VBOOL_T vbool_gt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + posX * lda * 2; + ao2 = a + posX * 2 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); + VLSEG2_FLOAT(&va20, &va21, ao2, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c new file mode 100644 index 0000000000..df5c916a57 --- /dev/null +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + (posX) * lda * 2; + + for (i = m; i > 0; i--, offset--) { + + VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); + VLSEG2_FLOAT(&va10, &va11, ao1, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c new file mode 100644 index 0000000000..dcf2b081ae --- /dev/null +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -0,0 +1,106 @@ 
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + (posX + 0) * lda * 2; + ao2 = a + posX * 2 + 0 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); + VLSEG2_FLOAT(&va20, &va21, ao2, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 
0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c new file mode 100644 index 0000000000..afd6944086 --- /dev/null +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -0,0 +1,145 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
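The two zsymm copy routines above use the same index-plus-offset mask trick as the hemm copies, but a symmetric matrix needs neither the conjugation nor the real-diagonal fix-up, so a single VMERGE per real/imaginary component remains. A square-block scalar sketch of the expansion, assuming the lower triangle holds the stored data; the helper name is illustrative only.

#include <stddef.h>

/* Expand a stored triangle into a full symmetric block (illustrative). */
static void zsymm_expand_ref(size_t n, const double *a, size_t lda, double *b)
{
    for (size_t j = 0; j < n; j++)
        for (size_t i = 0; i < n; i++) {
            size_t si = (i < j) ? j : i;        /* assume lower triangle stored */
            size_t sj = (i < j) ? i : j;
            b[2 * (i + j * n) + 0] = a[2 * (si + sj * lda) + 0];
            b[2 * (i + j * n) + 1] = a[2 * (si + sj * lda) + 1];
        }
}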
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT vid_v_i32m2 +#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T va0, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += 2; + b += vl * 2; + + X ++; + i ++; + } + else if (X < posY) + { + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c new file mode 100644 index 0000000000..c7d5939495 --- /dev/null +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -0,0 +1,143 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T va0, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X < posY) + { + //va1 = VLEV_FLOAT(ao, vl); + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + //va1 = VLEV_FLOAT(ao, vl); + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + //VSEV_FLOAT(b, vb, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c new file mode 100644 index 
0000000000..3c70b63853 --- /dev/null +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + FLOAT *ao; + + FLOAT_V_T va0, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += 2; + b += vl * 2; + + X++; + i++; + } + else if (X > 
posY) + { + ao += lda * 2; + b += vl * 2; + + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c new file mode 100644 index 0000000000..706782cf0b --- /dev/null +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -0,0 +1,140 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
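In the ztrmm copies above, blocks strictly on one side of the diagonal are either copied whole or skipped, and only the block that straddles the diagonal is processed lane by lane: a VMSGTU/VMSLTU mask zeroes the lanes on the excluded side, and with -DUNIT a VMSEQ mask replaces the diagonal element by 1 + 0i. The sketch below restates that diagonal-block handling in scalar form; `lower` selects which side is kept, since the four copy variants differ mainly in that choice, and the helper name is illustrative only.

#include <stddef.h>

/* Scalar model of the diagonal block in the ztrmm copies (illustrative). */
static void ztrmm_diag_block_ref(size_t nb, const double *a, size_t lda,
                                 double *b, int lower, int unit)
{
    for (size_t j = 0; j < nb; j++)
        for (size_t i = 0; i < nb; i++) {
            int keep = lower ? (i >= j) : (i <= j);   /* mask of retained lanes */
            double re = keep ? a[2 * (i + j * lda) + 0] : 0.0;
            double im = keep ? a[2 * (i + j * lda) + 1] : 0.0;
            if (unit && i == j) { re = 1.0; im = 0.0; }  /* UNIT diagonal */
            b[2 * (i + j * nb) + 0] = re;
            b[2 * (i + j * nb) + 1] = im;
        }
}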
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T va0, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X > posY) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c new file mode 100644 index 0000000000..27409ec25e --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -0,0 +1,574 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
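The kernel that follows, ztrmmkernel_rvv_v1x4.c, reuses the complex GEMM inner loops but trims the K range with the usual TRMM offset bookkeeping: `off` tracks the distance of the current block from the diagonal and `temp` is the number of packed K steps that actually contribute, selected by the LEFT/TRANSA build flags. The small helper below restates that selection, with `mr` standing for the vector block height vl and `nr` for the B block width; the names are illustrative and not part of the patch.

/* Number of packed K iterations that contribute for one TRMM block. */
static long trmm_k_length(long bk, long off, long mr, long nr,
                          int left, int transa)
{
    if ((left && !transa) || (!left && transa))
        return bk - off;        /* trailing part of K participates      */
    else if (left)
        return off + mr;        /* LEFT: off plus the A block height    */
    else
        return off + nr;        /* RIGHT: off plus the B block width    */
}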
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + BLASLONG off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*4*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = 
VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); 
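+ // vres6/vres7 are the real/imaginary accumulators for the fourth output
+ // column; OP_rr/OP_ir/OP_ii/OP_ri were mapped above to VFMACCVF_FLOAT or
+ // VFNMSACVF_FLOAT according to the NN/NT/.../CC conjugation case, so each
+ // group of four statements performs one complex multiply-accumulate of a
+ // B scalar pair (ptrbb) against an A vector pair (va6/va7 here).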
+ vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + va0 = VFMULVF_FLOAT(vres4, alphar, vl); + va1 = VFMULVF_FLOAT(vres5, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + VSSEG2_FLOAT(C2, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres6, alphar, vl); + va3 = VFMULVF_FLOAT(vres7, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + VSSEG2_FLOAT(C3, va2, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*4*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb 
+ 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + C1 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, 
*(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + ptrbb += 2; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + bb += bk << 1; + C += ldc << 1; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c new file mode 100644 index 0000000000..b7ccb1eb3e --- /dev/null +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c new file mode 100644 index 0000000000..911b81de58 --- /dev/null +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + b += vl * 2; + ao += lda * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c new file mode 100644 index 0000000000..db075c29ba --- /dev/null +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c new file mode 100644 index 0000000000..e121c62739 --- /dev/null +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va0, va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + ao += lda * 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + + return 0; +} diff --git a/param.h b/param.h index 62b675d6ce..236f500750 100644 --- a/param.h +++ b/param.h @@ -3055,11 +3055,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 #define DGEMM_DEFAULT_UNROLL_MN 32 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 160 #define DGEMM_DEFAULT_P 160 From 240695862984d4de845f1c42821a883946932df7 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 24 Feb 2023 10:44:55 +0000 Subject: [PATCH 09/36] * update intrinsics to match latest spec at https://github.com/riscv-non-isa/rvv-intrinsic-doc (in particular, __riscv_ prefixes for rvv intrinsics) * fix multiple numerical stability and corner case issues * add a script to generate arbitrary gemm kernel shapes * add a generic zvl256b target to demonstrate large gemm kernel unrolls --- common_riscv64.h | 15 +- cpuid_riscv64.c | 10 +- kernel/generic/trmmkernel_16x8.c | 3676 ++++++++++++++++++++ kernel/generic/zlaswp_ncopy_8.c | 1051 ++++++ kernel/riscv64/KERNEL.RISCV64_ZVL256B | 199 ++ kernel/riscv64/amax_vector.c | 231 +- kernel/riscv64/amin_vector.c | 252 +- kernel/riscv64/asum_vector.c | 99 +- kernel/riscv64/axpby_vector.c | 47 +- kernel/riscv64/axpy_vector.c | 42 +- kernel/riscv64/cgemm_kernel_8x8_zvl256b.c | 1931 ++++++++++ kernel/riscv64/copy_vector.c | 39 +- kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c | 2007 +++++++++++ kernel/riscv64/dgemm_kernel_8x8_zvl256b.c | 860 +++++ kernel/riscv64/dot.c | 2 +- kernel/riscv64/dot_vector.c | 86 +- kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c | 1068 ++++++ kernel/riscv64/gemv_n_vector.c | 24 +- kernel/riscv64/gemv_t_vector.c | 91 +- kernel/riscv64/generate_kernel.py | 670 ++++ kernel/riscv64/iamax_vector.c | 180 +- kernel/riscv64/iamin_vector.c | 160 +- kernel/riscv64/imax_vector.c | 124 +- kernel/riscv64/imin_vector.c | 185 +- kernel/riscv64/izamax_vector.c | 277 +- kernel/riscv64/izamin_vector.c | 275 +- kernel/riscv64/max_vector.c | 77 +- kernel/riscv64/min_vector.c | 77 +- kernel/riscv64/nrm2_vector.c | 342 +- kernel/riscv64/nrm2_vector_dot.c | 8 +- kernel/riscv64/rot_vector.c | 42 +- kernel/riscv64/scal_vector.c | 83 +- kernel/riscv64/sgemm_kernel_16x8_zvl256b.c | 1081 ++++++ kernel/riscv64/strmm_kernel_16x8_zvl256b.c | 1330 +++++++ kernel/riscv64/sum_vector.c | 114 + kernel/riscv64/swap_vector.c | 54 +- kernel/riscv64/symv_L_vector.c | 82 +- kernel/riscv64/symv_U_vector.c | 86 +- kernel/riscv64/zamax_vector.c | 90 +- kernel/riscv64/zamin_vector.c | 89 +- kernel/riscv64/zasum_vector.c | 107 +- kernel/riscv64/zaxpby_vector.c | 32 +- kernel/riscv64/zaxpy_vector.c | 20 +- kernel/riscv64/zcopy_vector.c | 12 +- kernel/riscv64/zdot_vector.c | 60 +- kernel/riscv64/zgemm_kernel_8x4_zvl256b.c | 1253 +++++++ kernel/riscv64/zgemm_kernel_generic.c | 140 + kernel/riscv64/zgemv_n_vector.c | 28 +- kernel/riscv64/zgemv_t_vector.c | 88 +- kernel/riscv64/zhemv_LM_vector.c | 60 +- kernel/riscv64/zhemv_UV_vector.c | 60 +- kernel/riscv64/znrm2_vector.c | 365 +- kernel/riscv64/zrot_vector.c | 38 +- kernel/riscv64/zscal_vector.c | 32 +- kernel/riscv64/zsum_vector.c | 131 + kernel/riscv64/zswap_vector.c | 50 +- kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c | 1337 +++++++ param.h | 39 + 58 files changed, 18634 insertions(+), 2374 deletions(-) create mode 100644 kernel/generic/trmmkernel_16x8.c create mode 100644 kernel/generic/zlaswp_ncopy_8.c create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL256B create 
mode 100644 kernel/riscv64/cgemm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/dgemm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c create mode 100755 kernel/riscv64/generate_kernel.py create mode 100644 kernel/riscv64/sgemm_kernel_16x8_zvl256b.c create mode 100644 kernel/riscv64/strmm_kernel_16x8_zvl256b.c create mode 100644 kernel/riscv64/sum_vector.c create mode 100644 kernel/riscv64/zgemm_kernel_8x4_zvl256b.c create mode 100644 kernel/riscv64/zgemm_kernel_generic.c create mode 100644 kernel/riscv64/zsum_vector.c create mode 100644 kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c diff --git a/common_riscv64.h b/common_riscv64.h index 2092bd5abc..de79c8cabb 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,12 +91,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) -#include -#endif - -#if defined(x280) -#include +#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v) +# include +# if !defined(DOUBLE) +# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v) +# else +# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v) +# endif +#else +# define EXTRACT_FLOAT(v) (v[0]) #endif #endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 5326787e6b..1b6b62f212 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -70,14 +70,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_GENERIC 0 -#define CPU_C910V 1 -#define CPU_x280 2 +#define CPU_GENERIC 0 +#define CPU_C910V 1 +#define CPU_RISCV64_ZVL256B 2 static char *cpuname[] = { "RISCV64_GENERIC", - "C910V" - "x280" + "C910V", + "CPU_RISCV64_ZVL256B" }; int detect(void){ diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c new file mode 100644 index 0000000000..5412eab70f --- /dev/null +++ b/kernel/generic/trmmkernel_16x8.c @@ -0,0 +1,3676 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + FLOAT res4_4; + FLOAT res4_5; + FLOAT res4_6; + FLOAT res4_7; + + FLOAT res4_8; + 
FLOAT res4_9; + FLOAT res4_10; + FLOAT res4_11; + FLOAT res4_12; + FLOAT res4_13; + FLOAT res4_14; + FLOAT res4_15; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + FLOAT res5_4; + FLOAT res5_5; + FLOAT res5_6; + FLOAT res5_7; + + FLOAT res5_8; + FLOAT res5_9; + FLOAT res5_10; + FLOAT res5_11; + FLOAT res5_12; + FLOAT res5_13; + FLOAT res5_14; + FLOAT res5_15; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + FLOAT res6_4; + FLOAT res6_5; + FLOAT res6_6; + FLOAT res6_7; + + FLOAT res6_8; + FLOAT res6_9; + FLOAT res6_10; + FLOAT res6_11; + FLOAT res6_12; + FLOAT res6_13; + FLOAT res6_14; + FLOAT res6_15; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + FLOAT res7_4; + FLOAT res7_5; + FLOAT res7_6; + FLOAT res7_7; + + FLOAT res7_8; + FLOAT res7_9; + FLOAT res7_10; + FLOAT res7_11; + FLOAT res7_12; + FLOAT res7_13; + FLOAT res7_14; + FLOAT res7_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + for( int pass = 0; pass < 2; ++pass ) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 
12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + b1 += 4*lda; + b2 += 4*lda; + b3 += 4*lda; + b4 += 4*lda; + b5 += 4*lda; + b6 += 4*lda; + b7 += 4*lda; + b8 += 4*lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + buffer += 16; + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 
= *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + + if (n & 4) { + { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = 
A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + } + } //if (n & 4) + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); 
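+ // B1..B8 mirror A1..A8 but are read from the pivot rows b1..b4; the
+ // branches below copy whichever pair is current into the packed buffer
+ // and write the displaced row elements back into the matrix.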
+ B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else { + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else { + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + 
*(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + // buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B new file mode 100644 index 0000000000..d8690682f4 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -0,0 +1,199 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = sum_vector.c +DSUMKERNEL = sum_vector.c +CSUMKERNEL = zsum_vector.c +ZSUMKERNEL = zsum_vector.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c + +SGEMMKERNEL = 
sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c +DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = 
zhemv_UV_vector.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 1b77993400..81a39af329 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -28,36 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -65,103 +66,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_zero; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_zero = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -169,94 +95,27 @@ asm volatile( BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = 
*((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index f9b7defaea..c8ba75f4a5 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -26,232 +26,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include -#include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - if (n <= 0 || inc_x <= 0) return(0.0); - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT minf=0.0; + if (n <= 0 || inc_x <= 0) return(minf); + + minf = *x; + x += inc_x; + --n; + if (n == 0) return(minf); + unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(minf, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define 
VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -64,75 +67,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ gvl = VSETVL(n); - v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -93,13 +91,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_y == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -119,14 +116,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); 
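The amax/amin/asum/dot rewrites above assemble every intrinsic name from the ELEN and LMUL settings with the token-pasting JOIN helpers, so one source file covers both the default m8 grouping and the m2 grouping selected for RISCV64_ZVL256B. The following reduced, compilable sketch shows how a name such as VLEV_FLOAT is composed; the STR helpers exist only to print the result and are not part of the patch:

#include <stdio.h>

#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y)   JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

#define ELEN 32          /* single precision */
#define LMUL m8          /* default (non-ZVL256B) register grouping */

/* Same composition as VLEV_FLOAT in the patch: __riscv_vle + 32 + _v_f + 32 + m8 */
#define VLEV_FLOAT_NAME JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)

/* Stringize only so the composed token can be printed. */
#define STR_X(s) #s
#define STR(s)   STR_X(s)

int main(void) {
    puts(STR(VLEV_FLOAT_NAME));   /* prints: __riscv_vle32_v_f32m8 */
    return 0;
}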
} }else if(inc_x == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -146,15 +141,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else{ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -174,9 +167,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } } return(dot); diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c new file mode 100644 index 0000000000..b1739f2488 --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c @@ -0,0 +1,1068 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='dtrmm_kernel_8x8_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: + tmp_unroll_j = int(tmp_unroll_j / 2) + + if tmp_unroll_i < a_regs or tmp_unroll_j < N: + dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + if ti == 0 and tj==0: + dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( 
A{i}r, B{j}i, gvl);") + else: + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") + dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) + + dest.write("ai += {M}*2;") + dest.write() + + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign + # so accumulating them directly to ACC would lose precision when ACC is larger + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") + dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + if TRMM: + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") + dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") + else: + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") + dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") + dest.write("{increment}") + + if not TRMM: + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") + + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") + 
dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") + dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") + dest.write("{increment}") + +#----------------------------------------------------------------------- +def generate_gemm_kernel( settings, OUTPUT ): + if settings['conjugate'].value: + ERROR('conjugate gemm not yet supported') + + is_complex = settings['complex'].value + generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real + dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) + + M = settings['M'].value + N = settings['N'].value + vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + a_regs = max(int(M/vlenmax), 1) + + accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + required_regs = accumulation_regs + a_regs + if is_complex: + required_regs = required_regs * 2 + 2 + dest.write(''' +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfmacc{tail_policy} +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfmsac{tail_policy} +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfnmsac{tail_policy} +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfnmacc{tail_policy} +#endif +'''.format(tail_policy=settings['tail_policy'].value)) + + + if required_regs > 32: + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '') + )) + + TRMM = (settings['op'].value == 'trmm') + if TRMM: + with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): + dest.write("#define BACKWARDS") + + dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", + alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), + trmm = (', BLASLONG offset' if TRMM else '') + ) + + with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): + if settings['trace'].value: + dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") + dest.write("{index_type} gvl = 0;") + dest.write("{index_type} m_top = 0;") + dest.write("{index_type} n_top = 0;") + + dest.write() + dest.write() + dest.write("// -- MAIN PASS") + + with dest.block("for ({index_type} j=0; j 0 ): + with dest.map(N=N_tail): + dest.write() + dest.write() + dest.write("// -- tails for N={N}") + with dest.block("if( N & 
{N} ) {{", "}}" ): + if settings['trace'].value: + dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) + dest.write("m_top = 0;") + with dest.block("for ({index_type} i=0; i M_tail_min ): + with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + a_regs = max( 1, int(M_tail/vlenmax) ) + vlen = int(M_tail/a_regs) + dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) + + generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) + dest.write( "m_top += {M_tail};" ) + + M_tail = int( M_tail / 2 ) + + while( M_tail > 0 ): + with dest.block("if( M & {M_tail} ) {{", "}}", + M_tail=M_tail, + N=N, + result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) + ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + for r in range(M_tail * N * (2 if is_complex else 1)): + dest.write("{result_t} result{r} = 0;", + r=r + ) + + dest.write("{index_type} ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + + if TRMM: + with dest.map(M=M_tail, N=N): + generate_trmm_block( dest ) + + with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 + )) + dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2 + )) + else: + dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( + dest=ki*M_tail+kj, kj=kj, ki=ki + )) + dest.write("ai+={M_tail}{elt_size};") + dest.write("bi+={N}{elt_size};") + + dest.write("{index_type} ci=n_top*ldc+m_top;") + if is_complex: + dest.write("{result_t} Cr, Ci;") + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + if TRMM: + dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + else: + dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) + dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) + dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) + dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) + dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) + dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) + else: + op = '' if TRMM else '+' + dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", + ki=ki, kj=kj, op=op, dest=ki*M_tail+kj + ) + dest.write("m_top+={M_tail};") + + M_tail = int(M_tail/2) + + +#----------------------------------------------------------------------- +class Setting(object): + def __init__( self, value, convert = None ): + self._value = value + self._convert = convert + + @classmethod + def ENUM( cls, *values ): + def closure( values ): + return lambda value: values[value.lower()] + return closure( { v.lower():v for v in values } ) + + 
@classmethod + def BOOL( cls, value ): + return value.lower().startswith('t') or value == '1' + + @property + def value( self ): + return self._value + + @property + def configurable( self ): + return self._convert is not None + + @value.setter + def value( self, value ): + self._value = self._convert( value ) + + def __str__( self ): + return str(self._value) + +#----------------------------------------------------------------------- +def main(): + settings = { + 'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), + 'M': Setting( 16, int ), + 'N': Setting( 4, int ), + 'reg_width_bits': Setting( 256, int ), + 'LMUL': Setting( 1, int ), + 'M_tail_scalar_from':Setting( 2, int ), + 'cpu': Setting( 'zvl256b', str ), + 'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), + 'force_acc_double': Setting( False, Setting.BOOL ), + 'complex': Setting( False, Setting.BOOL ), + 'conjugate': Setting( False, Setting.BOOL ), + 'index_type': Setting( 'BLASLONG', str ), + 'trace': Setting( False, Setting.BOOL ), + 'output': Setting( None, str ), + 'tail_policy': Setting( '', str ), # _ta, if toolchain supports it + '__riscv_': Setting( '__riscv_', str), + } + + for item in sys.argv[1:]: + try: + name, value = tuple(item.split( '=', 1 )) + except: + ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) + + if name not in settings: + ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) + +"options (and current defaults) are\n{}".format( + " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) + ) + + try: + settings[name].value = value + except: + import traceback + traceback.print_exc() + ERROR("couldn't parse {}".format(item)) + + if settings['output'].value is None: + if settings['complex'].value: + prefix = 'z' if settings['param_precision'].value == 'double' else 'c' + else: + prefix = 'd' if settings['param_precision'].value == 'double' else 's' + settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( + prefix, + settings['op'], + settings['M'], + settings['N'], + settings['cpu'] + )) + + if settings['param_precision'].value == 'double': + settings['param_scalar_t'] = Setting( 'double' ) + settings['ELEN_PARAM'] = Setting(64) + else: + settings['param_scalar_t'] = Setting( 'float' ) + settings['ELEN_PARAM'] = Setting(32) + + settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + + settings['ELEN_ACC'] = settings['ELEN_PARAM'] + settings['LMUL_ACC'] = Setting(settings['LMUL'].value) + widen = '' + + if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): + settings['ELEN_ACC'] = Setting(64) + settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) + settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) + widen = 'w' + + settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + + 
settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) + settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) + settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) + + + to_stdout = (settings['output'].value == '-') + if not to_stdout: + print("Writing {}".format(settings['output'].value), file=sys.stderr) + + with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: + def OUTPUT(*args, **kwargs): + print(*args, file=destination_file, **kwargs) + + OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) + OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) + + OUTPUT('#include "common.h"') + OUTPUT("\n") + + if settings['op'].value in ('gemm', 'trmm'): + generate_gemm_kernel(settings, OUTPUT) + else: + ERROR("unsupported kernel type {}".format(settings['op'])) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index 9fea522f7f..92880fbcfc 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -27,118 +27,111 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
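The generator above collapses the four complex conjugation variants into the S0..S3 sign macros and the VFMACC_RR/VFMACC_RI selections, and its scalar tail loop accumulates the real and imaginary parts with exactly those signs. A plain-C sketch of one k-step of that accumulation, specialised to the non-conjugated (NN-style) sign set and using hypothetical local names, looks like this:

#include <stdio.h>

/* One k-iteration of the scalar tail the generator emits, specialised to the
   NN-style sign set S0=1, S1=-1, S2=1, S3=1 (no conjugation): the usual
   complex multiply-accumulate  acc += a * b. */
static void cmadd(double ar, double ai, double br, double bi,
                  double *acc_r, double *acc_i)
{
    *acc_r += ar * br - ai * bi;   /* S0*ar*br + S1*ai*bi with S1 = -1 */
    *acc_i += ai * br + ar * bi;   /* S2*ai*br + S3*ar*bi */
}

int main(void) {
    double cr = 0.0, ci = 0.0;
    cmadd(1.0, 2.0, 3.0, 4.0, &cr, &ci);   /* (1+2i)*(3+4i) = -5 + 10i */
    printf("%g %+gi\n", cr, ci);
    return 0;
}

The conjugated variants only flip S1, S2 and S3; the vector path reaches the same effect by choosing among vfmacc, vfmsac, vfnmsac and vfnmacc, as in the preamble the generator writes at the top of each complex kernel.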
#include "common.h" #include +#include #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFABS_FLOAT __riscv_vfabs_v_f64m4 +#define VCOMPRESS __riscv_vcompress_vm_u64m4 +#define VMV_X __riscv_vmv_x_s_u64m4_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFABS_FLOAT __riscv_vfabs_v_f32m4 +#define VCOMPRESS __riscv_vcompress_vm_u32m4 +#define VMV_X __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, 
j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); + + gvl = VSETVL(n); + UINT_V_T vid = VIDV_UINT(gvl); if(inc_x == 1){ - gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLEV_FLOAT(&x[j], gvl); + v_max = VFABS_FLOAT(v_max, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -146,51 +139,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = 
VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 4e81e78484..0503f9948b 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -31,85 +31,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 
vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -117,29 +111,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLEV_FLOAT(&x[j], gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -151,12 +145,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, 
BLASLONG inc_x) v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -165,33 +157,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index ca48a3c48e..e24f9fd485 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -31,68 +31,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); - FLOAT maxf=-FLT_MAX; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = 
VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -104,32 +102,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -145,37 +145,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index 2a677098d3..a60bd3d07a 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -31,122 +31,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
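The iamax/imax rewrites drop the old vmfirst-plus-pointer-cast index lookup: candidate indices are maintained per lane with a masked vid/vadd, the global maximum is read back from the m1 reduction result, and the winning index is obtained by compressing the index vector under a vmfge mask and taking lane 0. The sketch below condenses that pattern for contiguous single-precision data under the v0.12 __riscv_ intrinsics used throughout this patch; the function name is illustrative, it returns a 0-based index, and leftover elements are handled with a scalar loop rather than the vectorised tail of the real kernel:

#include <float.h>
#include <stddef.h>
#include <stdint.h>
#include <riscv_vector.h>

/* Index (0-based) of the largest element of a contiguous float array,
   following the masked-vid / vcompress pattern of the imax kernel above. */
static size_t imax_sketch(const float *x, size_t n)
{
    if (n == 0) return 0;

    size_t gvl = __riscv_vsetvl_e32m8(n);
    vfloat32m1_t v_res = __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1);
    vfloat32m8_t v_max = __riscv_vfmv_v_f_f32m8(-FLT_MAX, gvl);
    vuint32m8_t  v_idx = __riscv_vmv_v_x_u32m8(0, gvl);

    size_t j = 0;
    for (size_t i = 0; i < n / gvl; i++, j += gvl) {
        vfloat32m8_t vx = __riscv_vle32_v_f32m8(&x[j], gvl);
        /* lanes where the new value beats the running per-lane maximum */
        vbool4_t m = __riscv_vmflt_vv_f32m8_b4(v_max, vx, gvl);
        /* refresh the candidate index only in those lanes: lane id + j */
        v_idx = __riscv_vid_v_u32m8_mu(m, v_idx, gvl);
        v_idx = __riscv_vadd_vx_u32m8_mu(m, v_idx, v_idx, (uint32_t)j, gvl);
        v_max = __riscv_vfmax_vv_f32m8(v_max, vx, gvl);
    }

    /* reduce to the global maximum, then pick the index of a lane holding it */
    v_res = __riscv_vfredmax_vs_f32m8_f32m1(v_max, v_res, gvl);
    float maxf = __riscv_vfmv_f_s_f32m1_f32(v_res);
    vbool4_t hit = __riscv_vmfge_vf_f32m8_b4(v_max, maxf, gvl);
    vuint32m8_t packed = __riscv_vcompress_vm_u32m8(v_idx, hit, gvl);
    size_t best = __riscv_vmv_x_s_u32m8_u32(packed);

    for (; j < n; j++)              /* scalar tail, kept simple for the sketch */
        if (x[j] > maxf) { maxf = x[j]; best = j; }
    return best;
}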
#if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = 
VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -159,59 +142,39 @@ asm volatile( for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - - v_res = VFREDMINVS_FLOAT(v_res, v_min, 
v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 66a101566f..89cd510c19 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -27,241 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include +#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VSEVU_UINT vse64_v_u64m8 +#define VSEVU_UINT __riscv_vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT __riscv_vse32_v_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; - FLOAT_V_T vx0, vx1, v_max; + FLOAT_V_T vx, vx2, v_max; UINT_V_T v_max_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); + //index where element greater than v_max - mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - 
"vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - vx0 = VFMVVF_FLOAT(0, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); - mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_max_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_max = VFADDVV_FLOAT(v_max, vx2, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); + if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); - mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } - return(max_index+1); -} - + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 818193a9e0..74daf32b85 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,235 +31,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t #define VSEVU_UINT vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT __riscv_vse32_v_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 
+#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; - FLOAT_V_T vx0, vx1, v_min; + FLOAT_V_T vx, vx2, v_min; UINT_V_T v_min_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; - v_min_index = VMVVX_UINT(0, gvl); + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; + v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); - //index where element less than v_min - mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); - mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; + + v_res = 
VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_min_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = VFMVFS_FLOAT(v_res); - if(cur_minf < minf){ + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_min = VFADDVV_FLOAT(v_min, vx2, gvl); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); - mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; - + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } - return(min_index+1); -} - + return(min_index+1); +} diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 7f31e9a530..97f602e515 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
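Both izamax and izamin load the interleaved real and imaginary parts with strided loads: the real parts start at x[0], the imaginary parts at x[1], and the byte stride is inc_x * 2 * sizeof(FLOAT). The sketch below shows that access pattern with the v1.0 intrinsics (double precision assumed; the helper name is illustrative, not part of the patch):

    #include <stddef.h>
    #include <riscv_vector.h>

    /* Illustrative sketch: load one stripe of |re| + |im| magnitudes from an
     * interleaved complex array with increment inc_x (in complex elements). */
    vfloat64m8_t load_abs_sums(const double *x, long inc_x, size_t vl)
    {
        ptrdiff_t stride = (ptrdiff_t)(inc_x * 2 * sizeof(double));
        vfloat64m8_t re = __riscv_vlse64_v_f64m8(&x[0], stride, vl);
        vfloat64m8_t im = __riscv_vlse64_v_f64m8(&x[1], stride, vl);
        re = __riscv_vfabs_v_f64m8(re, vl);
        im = __riscv_vfabs_v_f64m8(im, vl);
        return __riscv_vfadd_vv_f64m8(re, im, vl);   /* |re| + |im| per lane */
    }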
#include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFMAXVV_FLOAT(v_max, v1, gvl); j += gvl * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 14b7e01ed1..77bf19b9de 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, v1, gvl); j += gvl * 2; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); } for(;j= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 
__riscv_vfmv_v_f_f64m1 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + + if(inc_x == 1){ + gvl = VSETVL(n); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; if (n < 0) return(0); + + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? 
n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } + if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 - +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 - +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) +#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -70,10 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -84,9 +87,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); mask0 = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); + v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); mask1 = VMFLTVF_FLOAT(v1, 0, gvl); - v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); + v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); v0 = VFADDVV_FLOAT(v0, v1, gvl); v_max = VFMAXVV_FLOAT(v_max, v0, gvl); @@ -94,22 +97,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; ix += inc_xv; } - 
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); if(j maxf) - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index d9eca7f102..095b1c3dfc 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -29,38 +29,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) +#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -69,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -83,9 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); mask0 = VMFLTVF_FLOAT(v0, 0, 
gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); + v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); mask1 = VMFLTVF_FLOAT(v1, 0, gvl); - v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); + v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); v0 = VFADDVV_FLOAT(v0, v1, gvl); v_min = VFMINVV_FLOAT(v_min, v0, gvl); @@ -93,21 +99,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; ix += inc_xv; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); if(j -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -67,12 +73,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; gvl = VSETVL(n2); @@ -81,26 +84,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i N ) + n_packing >>= 1; + + BLASLONG m_packing = UNROLL_M; + BLASLONG m_top = 0; + while (m_top < M) + { + while( m_top+m_packing > M ) + m_packing >>= 1; + + BLASLONG ai = 
K*m_top*2; + BLASLONG bi = K*n_top*2; + + BLASLONG pass_K = K; + + + #ifdef TRMMKERNEL + #ifdef LEFT + BLASLONG off = offset + m_top; + #else + BLASLONG off = -offset + n_top; + #endif + #ifdef BACKWARDS + ai += off * m_packing*2; + bi += off * n_packing*2; + pass_K -= off; + #else + #ifdef LEFT + pass_K = off + m_packing; + #else + pass_K = off + n_packing; + #endif + #endif + #endif + + memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) ); + + for (BLASLONG k=0; k 0 ){ // scale change? + // find largest element in v0 and v1 + v_res = VFREDMAX( v0, v_z0, gvl ); + v_res = VFREDMAX( v1, v_res, gvl ); + FLOAT const largest_elt = EXTRACT_FLOAT( v_res ); + + v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt + v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt) + v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt) + + v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale } - //ssq in vector vr: vr[0] - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - //total ssq now - ssq += VFMVFS_FLOAT(v_res); - //tail - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); - //if scale change - mask = VMFGTVF_FLOAT(v0, scale, gvl); - index = VMFIRSTM(mask, gvl); - if(index == -1){//no elements greater than scale - if(scale != 0.0){ - v0 = VFDIVVF_FLOAT(v0, scale, gvl); - vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); + MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl ); + MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl ); + v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl ); + v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl ); + v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl ); + v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl ); + + idx += inc_x * gvl * 2; + } + + v_res = VFREDUSUM(v_ssq, v_z0, gvl); + FLOAT ssq = EXTRACT_FLOAT(v_res); + FLOAT scale = EXTRACT_FLOAT0_V(v_scale); + + //finish any tail using scalar ops + i*=gvl; + if(i + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); + + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl 
<= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 64 +# else +# define ELEN 32 +# define MLEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 16 +# else +# define ELEN 32 +# define MLEN 8 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i Date: Wed, 1 Mar 2023 17:40:42 +0000 Subject: [PATCH 10/36] factoring riscv64/dot.c fix into separate PR as requested --- kernel/riscv64/dot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index bf55998ca9..46a84ad189 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; - if ( n < 1 ) return(dot); + if ( n < 0 ) return(dot); while(i < n) { From 1374a2d08b078451dcfaf723614ea13e441e1d06 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Sun, 19 Mar 2023 23:59:03 -0700 Subject: [PATCH 11/36] This PR adapts latest spec changes Add prefix (_riscv) for all riscv intrinsics Update some intrinsics' parameter, like vfredxxxx, vmerge --- kernel/riscv64/amax_rvv.c | 54 +++++------ kernel/riscv64/amin_rvv.c | 54 +++++------ kernel/riscv64/asum_rvv.c | 54 +++++------ kernel/riscv64/axpby_rvv.c | 36 +++---- kernel/riscv64/axpy_rvv.c | 28 +++--- kernel/riscv64/copy_rvv.c | 24 ++--- kernel/riscv64/dot_rvv.c | 66 ++++++------- kernel/riscv64/gemm_beta_rvv.c | 24 ++--- kernel/riscv64/gemm_ncopy_8_rvv.c | 28 +++--- kernel/riscv64/gemm_ncopy_rvv_v1.c | 20 ++-- kernel/riscv64/gemm_tcopy_8_rvv.c | 44 ++++----- kernel/riscv64/gemm_tcopy_rvv_v1.c | 16 ++-- kernel/riscv64/gemmkernel_rvv_v1x8.c | 24 ++--- kernel/riscv64/gemv_n_rvv.c | 28 +++--- kernel/riscv64/gemv_t_rvv.c | 53 +++++------ kernel/riscv64/iamax_rvv.c | 115 +++++++++++----------- kernel/riscv64/iamin_rvv.c | 117 +++++++++++------------ kernel/riscv64/imax_rvv.c | 113 
+++++++++++----------- kernel/riscv64/imin_rvv.c | 113 +++++++++++----------- kernel/riscv64/izamax_rvv.c | 127 ++++++++++++------------- kernel/riscv64/izamin_rvv.c | 121 ++++++++++++----------- kernel/riscv64/max_rvv.c | 50 +++++----- kernel/riscv64/min_rvv.c | 50 +++++----- kernel/riscv64/nrm2_rvv.c | 46 ++++----- kernel/riscv64/rot_rvv.c | 36 +++---- kernel/riscv64/scal_rvv.c | 32 +++---- kernel/riscv64/sum_rvv.c | 50 +++++----- kernel/riscv64/swap_rvv.c | 28 +++--- kernel/riscv64/symm_lcopy_rvv_v1.c | 50 +++++----- kernel/riscv64/symm_ucopy_rvv_v1.c | 50 +++++----- kernel/riscv64/symv_L_rvv.c | 81 ++++++++-------- kernel/riscv64/symv_U_rvv.c | 81 ++++++++-------- kernel/riscv64/trmm_lncopy_rvv_v1.c | 48 +++++----- kernel/riscv64/trmm_ltcopy_rvv_v1.c | 44 ++++----- kernel/riscv64/trmm_uncopy_rvv_v1.c | 48 +++++----- kernel/riscv64/trmm_utcopy_rvv_v1.c | 44 ++++----- kernel/riscv64/trmmkernel_rvv_v1x8.c | 28 +++--- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 42 ++++---- kernel/riscv64/trsm_lncopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_ltcopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_uncopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_utcopy_rvv_v1.c | 40 ++++---- kernel/riscv64/zamax_rvv.c | 58 +++++------ kernel/riscv64/zamin_rvv.c | 58 +++++------ kernel/riscv64/zasum_rvv.c | 51 +++++----- kernel/riscv64/zaxpby_rvv.c | 52 +++++----- kernel/riscv64/zaxpy_rvv.c | 32 +++---- kernel/riscv64/zcopy_rvv.c | 44 ++++----- kernel/riscv64/zdot_rvv.c | 65 +++++++------ kernel/riscv64/zgemm_beta_rvv.c | 32 +++---- kernel/riscv64/zgemm_ncopy_4_rvv.c | 24 ++--- kernel/riscv64/zgemm_ncopy_rvv_v1.c | 16 ++-- kernel/riscv64/zgemm_tcopy_4_rvv.c | 40 ++++---- kernel/riscv64/zgemm_tcopy_rvv_v1.c | 16 ++-- kernel/riscv64/zgemmkernel_rvv_v1x4.c | 36 +++---- kernel/riscv64/zgemv_n_rvv.c | 48 +++++----- kernel/riscv64/zgemv_t_rvv.c | 61 ++++++------ kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 84 ++++++++-------- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 84 ++++++++-------- kernel/riscv64/znrm2_rvv.c | 66 ++++++------- kernel/riscv64/zrot_rvv.c | 52 +++++----- kernel/riscv64/zscal_rvv.c | 44 ++++----- kernel/riscv64/zsum_rvv.c | 47 +++++---- kernel/riscv64/zswap_rvv.c | 24 ++--- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 61 ++++++------ kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 60 ++++++------ kernel/riscv64/ztrmmkernel_2x2_rvv.c | 60 ++++++------ kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 40 ++++---- kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 36 +++---- 79 files changed, 2013 insertions(+), 2031 deletions(-) diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c index c9c6e7f730..be0bdbea0c 100644 --- a/kernel/riscv64/amax_rvv.c +++ b/kernel/riscv64/amax_rvv.c @@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
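As the PATCH 11 message above says, the RVV intrinsics gained the __riscv_ prefix and the reductions dropped the destination operand: the accumulator is now the second argument, followed by vl. The fragment below is a minimal sketch of an amax-style loop in the new style, not code from the patch; it assumes a toolchain with the ratified v1.0 intrinsics (for example clang 16+ or gcc 13+ with -march=rv64gcv):

    #include <stddef.h>
    #include <riscv_vector.h>

    /* Sketch only: max(|x[i]|) over a unit-stride array, reducing each
     * stripe into a one-element (m1) accumulator with the new-style
     * __riscv_vfredmax_vs_f64m8_f64m1(vector, accumulator, vl) call. */
    double amax_sketch(const double *x, size_t n)
    {
        vfloat64m1_t v_res = __riscv_vfmv_v_f_f64m1(0.0, 1);   /* |x| >= 0 */

        for (size_t i = 0; i < n; ) {
            size_t vl = __riscv_vsetvl_e64m8(n - i);
            vfloat64m8_t vx = __riscv_vle64_v_f64m8(&x[i], vl);
            vx = __riscv_vfabs_v_f64m8(vx, vl);
            v_res = __riscv_vfredmax_vs_f64m8_f64m1(vx, v_res, vl);
            i += vl;
        }
        return __riscv_vfmv_f_s_f64m1_f64(v_res);   /* 0.0 if n == 0 */
    }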
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c index 370b6c3388..d4926084b7 100644 --- a/kernel/riscv64/amin_rvv.c +++ b/kernel/riscv64/amin_rvv.c @@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c index 4f711c9be0..691591e22b 100644 --- a/kernel/riscv64/asum_rvv.c +++ b/kernel/riscv64/asum_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -93,7 +93,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); asumf = VFMVFS_FLOAT_M1(v_res); return(asumf); } diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index 7c35c563d1..a1dbdb0e42 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #endif int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c index 3986f4e212..8bc2f30de7 100644 --- a/kernel/riscv64/axpy_rvv.c +++ b/kernel/riscv64/axpy_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c index 5d5a8bd049..041fd2daeb 100644 --- a/kernel/riscv64/copy_rvv.c +++ b/kernel/riscv64/copy_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c index 60dcac2f57..3276695b63 100644 --- a/kernel/riscv64/dot_rvv.c +++ b/kernel/riscv64/dot_rvv.c @@ -37,24 +37,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( n <= 0 ) return(dot); - size_t vlmax = vsetvlmax_e64m8(); - vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); + size_t vlmax = __riscv_vsetvlmax_e64m8(); + vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax); if(inc_x == 1 && inc_y == 1) { for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vle32_v_f32m4(x, vl); - vfloat32m4_t vy = vle32_v_f32m4(y, vl); + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vle64_v_f64m8(x, vl); - vfloat64m8_t vy = vle64_v_f64m8(y, vl); + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } @@ -63,18 +63,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vle32_v_f32m4(x, vl); - vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vle64_v_f64m8(x, vl); - vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } else if (1 == inc_y) { @@ -82,18 +82,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_x = inc_x * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); - vfloat32m4_t vy = vle32_v_f32m4(y, vl); + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, 
vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); - vfloat64m8_t vy = vle64_v_f64m8(y, vl); + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } else { @@ -102,25 +102,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); - vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); - vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } - vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); - vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); - dot = vfmv_f_s_f64m1_f64(vec_sum); + vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); + dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); return(dot); } diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c index 34d1ea0780..f3cf6491d5 100644 --- a/kernel/riscv64/gemm_beta_rvv.c +++ b/kernel/riscv64/gemm_beta_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 #endif // Optimizes the implementation in ../generic/gemm_beta.c diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c index 525b223c20..3030d67fbc 100644 --- a/kernel/riscv64/gemm_ncopy_8_rvv.c +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
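dot_rvv.c above accumulates single-precision products in double precision through the widening FMA and only reduces once at the end. The sketch below shows the same idea with the new intrinsic names but simplified: it widens with vfwmul and folds every chunk into the scalar sum right away, rather than keeping the f64m8 accumulator the kernel uses. Names are illustrative and the code is not part of the patch.

#include <riscv_vector.h>
#include <stddef.h>

/* Single-precision dot product accumulated in double precision. */
double sdot_sketch(size_t n, const float *x, const float *y)
{
    vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0.0, __riscv_vsetvlmax_e64m1());

    for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
        vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
        /* widen each product to f64m8, then fold it into the running sum
           with the three-operand vfredusum (vector, scalar, vl) */
        vfloat64m8_t vp = __riscv_vfwmul_vv_f64m8(vx, vy, vl);
        vsum = __riscv_vfredusum_vs_f64m8_f64m1(vp, vsum, vl);
    }
    return __riscv_vfmv_f_s_f64m1_f64(vsum);
}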
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif // Optimizes the implementation in ../generic/gemm_ncopy_8.c diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c index 2c5230752c..2d6db15e55 100644 --- a/kernel/riscv64/gemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 81c1f962bc..080a873123 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEV_FLOAT vlse32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEV_FLOAT vlse64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c index a291b70b81..c5fb6479fb 100644 --- a/kernel/riscv64/gemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c index 5cd509f93a..471b3158fe 100644 --- a/kernel/riscv64/gemmkernel_rvv_v1x8.c +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #endif int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c index 9d2dee6158..1366eb5adf 100644 --- a/kernel/riscv64/gemv_n_rvv.c +++ b/kernel/riscv64/gemv_n_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c index a80af81b63..f0c8348669 100644 --- a/kernel/riscv64/gemv_t_rvv.c +++ b/kernel/riscv64/gemv_t_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -63,7 +63,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va, vx, vr; FLOAT_V_T_M1 v_res, v_z0; size_t vlmax = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL_MAX; @@ -83,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO vr = VFMACCVV_FLOAT(vr, va, vx, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); *y += alpha * VFMVFS_FLOAT_M1(v_res); y += inc_y; a += lda; @@ -107,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO vr = VFMACCVV_FLOAT(vr, va, vx, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); *y += alpha * VFMVFS_FLOAT_M1(v_res); y += inc_y; a += lda; diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index 8b33b3bcbe..ef7850a55a 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -28,57 +28,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 
__riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -134,16 +134,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_z0; + FLOAT_V_T_M1 v_res; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_z0 = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index 585b371861..56a086fed4 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -29,57 +29,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
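The iamax changes above combine the three-operand reduction with the mask/vfirst machinery that extracts the winning index in a single pass. As a reference for the new names only, here is a deliberately simplified two-pass variant (hypothetical function, not the kernel's single-pass masked-index tracking): one pass for the maximum magnitude, one pass to locate the first element that reaches it.

#include <riscv_vector.h>
#include <stddef.h>

/* 1-based index of the first element with the largest magnitude,
   or 0 when n == 0. */
size_t isamax_sketch(const float *x, size_t n)
{
    /* pass 1: the maximum magnitude, folded chunk by chunk */
    vfloat32m1_t v_res = __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
    for (size_t vl, left = n, i = 0; left > 0; left -= vl, i += vl) {
        vl = __riscv_vsetvl_e32m8(left);
        vfloat32m8_t vx = __riscv_vfabs_v_f32m8(__riscv_vle32_v_f32m8(x + i, vl), vl);
        v_res = __riscv_vfredmax_vs_f32m8_f32m1(vx, v_res, vl);
    }
    float maxf = __riscv_vfmv_f_s_f32m1_f32(v_res);

    /* pass 2: first position whose magnitude reaches maxf */
    for (size_t vl, left = n, i = 0; left > 0; left -= vl, i += vl) {
        vl = __riscv_vsetvl_e32m8(left);
        vfloat32m8_t vx = __riscv_vfabs_v_f32m8(__riscv_vle32_v_f32m8(x + i, vl), vl);
        vbool4_t hit = __riscv_vmfge_vf_f32m8_b4(vx, maxf, vl);
        long pos = __riscv_vfirst_m_b4(hit, vl);   /* -1 if no bit is set */
        if (pos >= 0)
            return i + (size_t)pos + 1;
    }
    return 0;
}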
#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define 
VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -135,16 +135,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index d84ad968e2..5b60a56f79 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define 
VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_min; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index fb734f6f8a..b49544a1bb 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define 
VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index 9cb332cbb6..e61d0cbec1 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -28,63 +28,63 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
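The imin/imax hunks above also change what seeds the reduction: with the old four-operand form a separate scratch vector carried FLT_MAX or -FLT_MAX, while with the three-operand form the start value goes directly into the scalar operand, which is both the initial value and the result. A plain (value, not index) minimum written that way, for reference only and with an illustrative name:

#include <riscv_vector.h>
#include <float.h>
#include <stddef.h>

/* Minimum of x[0..n-1]; the scalar operand must start at the identity
   (FLT_MAX for min, -FLT_MAX for max). Returns FLT_MAX when n == 0. */
float min_sketch(const float *x, size_t n)
{
    vfloat32m1_t v_res = __riscv_vfmv_v_f_f32m1(FLT_MAX, __riscv_vsetvlmax_e32m1());

    for (size_t vl; n > 0; n -= vl, x += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl);
        v_res = __riscv_vfredmin_vs_f32m8_f32m1(vx, v_res, vl);
    }
    return __riscv_vfmv_f_s_f32m1_f32(v_res);
}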
#include "common.h" #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define MASK_T vbool16_t -#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 -#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 -#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMAXVV_FLOAT vfmax_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFIRSTM vfirst_m_b16 -#define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT vid_v_u64m4_m -#define VIDV_UINT vid_v_u64m4 -#define VADDVX_MASK_UINT vadd_vx_u64m4_m -#define VADDVX_UINT vadd_vx_u64m4 -#define VMVVX_UINT vmv_v_x_u64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 -#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 #else -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 -#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 -#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMAXVV_FLOAT vfmax_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT vid_v_u32m4_m -#define VIDV_UINT vid_v_u32m4 -#define VADDVX_MASK_UINT vadd_vx_u32m4_m -#define VADDVX_UINT vadd_vx_u32m4 -#define VMVVX_UINT vmv_v_x_u32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 -#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define 
VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); @@ -138,24 +138,23 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); } } - FLOAT_V_T_M1 v_res, v_z0; + FLOAT_V_T_M1 v_res; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_z0 = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index 69771e5aa5..297b3c99a3 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -29,59 +29,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 -#define MASK_T vbool16_t -#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 -#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 -#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMINVV_FLOAT vfmin_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFIRSTM vfirst_m_b16 -#define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT vid_v_u64m4_m -#define VIDV_UINT vid_v_u64m4 -#define VADDVX_MASK_UINT vadd_vx_u64m4_m -#define VADDVX_UINT vadd_vx_u64m4 -#define VMVVX_UINT vmv_v_x_u64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 -#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 #else -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 -#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 -#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMINVV_FLOAT vfmin_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT vid_v_u32m4_m -#define VIDV_UINT vid_v_u32m4 -#define VADDVX_MASK_UINT vadd_vx_u32m4_m -#define VADDVX_UINT vadd_vx_u32m4 -#define VMVVX_UINT vmv_v_x_u32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 -#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT 
__riscv_vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -145,16 +145,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c index 5b1380d2b2..9315321f4c 100644 --- a/kernel/riscv64/max_rvv.c +++ b/kernel/riscv64/max_rvv.c @@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c index bddcc0ba7d..158b682fd1 100644 --- a/kernel/riscv64/min_rvv.c +++ b/kernel/riscv64/min_rvv.c @@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 979c316481..42abfa1196 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -29,30 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #define ABS fabsf #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #define ABS fabs #endif @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c index 7bf5e42703..90f81d5e28 100644 --- a/kernel/riscv64/rot_rvv.c +++ b/kernel/riscv64/rot_rvv.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index d2c0378bfe..2e2cfd31e4 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c index 1db0d09ddf..9715faf224 100644 --- a/kernel/riscv64/sum_rvv.c +++ b/kernel/riscv64/sum_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -89,7 +89,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); sumf = VFMVFS_FLOAT_M1(v_res); return(sumf); } diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c index 2cf92f6ad9..893d705549 100644 --- a/kernel/riscv64/swap_rvv.c +++ b/kernel/riscv64/swap_rvv.c @@ -28,23 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c index f0def96176..a615db44d9 100644 --- a/kernel/riscv64/symm_lcopy_rvv_v1.c +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif // Optimizes the implementation in ../generic/symm_lcopy_4.c @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, 
BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); VSEV_FLOAT(b, vb, vl); b += vl; diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c index 958506df31..464f97b3a6 100644 --- a/kernel/riscv64/symm_ucopy_rvv_v1.c +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif // Optimizes the implementation in ../generic/symm_ucopy_4.c @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); VSEV_FLOAT(b, vb, vl); b += vl; diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index 737abaae3e..e87ab22ae3 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -28,43 +28,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T_M1 vfloat32m1_t -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T_M1 vfloat64m1_t -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT_V_T_M1 v_res, v_z0; size_t vlmax = VSETVL_MAX_M1, vl; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL_MAX; @@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, 
BLASLONG lda, FLOA vr = VFMACCVV_FLOAT(vr, vx, va, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[j] += alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); jy += inc_y; @@ -172,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[j] += alpha * VFMVFS_FLOAT_M1(v_res); jx += inc_x; @@ -211,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); jx += inc_x; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index cb923be5d1..3fbc33c893 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -29,43 +29,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T_M1 vfloat32m1_t -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T_M1 vfloat64m1_t -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL_MAX_M1 
__riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT *a_ptr = a; FLOAT_V_T_M1 v_res, v_z0; size_t vl_max = VSETVL_MAX_M1, vl; - v_res = VFMVVF_FLOAT_M1(0, vl_max); v_z0 = VFMVVF_FLOAT_M1(0, vl_max); vl_max = VSETVL_MAX; @@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], vl); vr = VFMACCVV_FLOAT(vr, vx, va, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -171,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -209,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c index 3457ca3e1b..4135a9b621 100644 --- a/kernel/riscv64/trmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c @@ -116,10 +116,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLSEV_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao++; diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c index 2fe8cf79e1..580714fde8 100644 --- a/kernel/riscv64/trmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c @@ -111,10 +111,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLEV_FLOAT(ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao += lda; diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c index b64cd840d0..852ab7f111 100644 --- a/kernel/riscv64/trmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c @@ -114,10 +114,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLSEV_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao++; diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c index b96daae5be..e0b6d362df 100644 --- a/kernel/riscv64/trmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -32,27 +32,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c @@ -113,10 +113,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLEV_FLOAT(ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao += lda; diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c index 97b14650c2..393b24bce0 100644 --- a/kernel/riscv64/trmmkernel_rvv_v1x8.c +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 2cba06b386..886af0c3b7 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT 
__riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index 492a5631fa..ddeef966c0 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 4751ae012f..4c83bbaa3b 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif static FLOAT dm1 = -1.; diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index 93a9e69169..b368eefb99 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -28,28 +28,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c index bacfb2b08e..41c84be258 100644 --- a/kernel/riscv64/trsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c index 0fc7c9f243..003bd34654 100644 --- a/kernel/riscv64/trsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif #ifndef UNIT diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c index ee869a7951..6cca5d49cc 100644 --- a/kernel/riscv64/trsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c index a324b0fa6f..bc058525f9 100644 --- a/kernel/riscv64/trsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index 1917042be4..615b7519c3 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <math.h> #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -106,7 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index 3f027383a0..a0d36d46f9 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <math.h> #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -105,7 +105,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c index 7876646b32..1d2f0e1fe0 100644 --- a/kernel/riscv64/zasum_rvv.c +++ b/kernel/riscv64/zasum_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -99,9 +99,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); asumf += VFMVFS_FLOAT_M1(v_res); return(asumf); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index 66f52d9d0b..e0da553110 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -33,33 +33,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c index 777bcb7287..3f75898e04 100644 --- a/kernel/riscv64/zaxpy_rvv.c +++ b/kernel/riscv64/zaxpy_rvv.c @@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c index 5d8322bbbd..bd94810ce6 100644 --- a/kernel/riscv64/zcopy_rvv.c +++ b/kernel/riscv64/zcopy_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL_M8(n) vsetvl_e32m8(n) -#define FLOAT_V_T_M8 vfloat32m8_t -#define VLEV_FLOAT_M8 vle32_v_f32m8 -#define VSEV_FLOAT_M8 vse32_v_f32m8 - -#define VSETVL_M4(n) vsetvl_e32m4(n) -#define FLOAT_V_T_M4 vfloat32m4_t -#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 -#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 +#define VSETVL_M8(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 __riscv_vle32_v_f32m8 +#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T_M4 vfloat32m4_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL_M8(n) vsetvl_e64m8(n) -#define FLOAT_V_T_M8 vfloat64m8_t -#define VLEV_FLOAT_M8 vle64_v_f64m8 -#define VSEV_FLOAT_M8 vse64_v_f64m8 - -#define VSETVL_M4(n) vsetvl_e64m4(n) -#define FLOAT_V_T_M4 vfloat64m4_t -#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 -#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 +#define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 __riscv_vle64_v_f64m8 +#define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T_M4 vfloat64m4_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index 7eae6f608e..1543c513d0 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMSACVV_FLOAT vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMSACVV_FLOAT vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -72,7 +72,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; FLOAT_V_T_M1 v_res, v_z0; size_t vlmax_m1 = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax_m1); v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); size_t vlmax = VSETVL_MAX; @@ -161,9 +160,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA } } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax); CREAL(result) = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax); CIMAG(result) = VFMVFS_FLOAT_M1(v_res); return(result); diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c index a89752d18e..b94b5f4bf5 100644 --- 
a/kernel/riscv64/zgemm_beta_rvv.c +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -39,23 +39,23 @@ #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFSUBVV_FLOAT vfsub_vv_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFSUBVV_FLOAT vfsub_vv_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c index 389ee5d57c..d50a4b8d55 100644 --- a/kernel/riscv64/zgemm_ncopy_4_rvv.c +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif // Optimizes the implementation in ../generic/zgemm_ncopy_4.c diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c index df039bab60..1d3b8d3b71 100644 --- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -29,15 +29,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index 1b34039c8f..8c35b5616e 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c index 7622fb8104..7a085269c8 100644 --- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c index 50e29222f0..41399cf79b 100644 --- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c index 2eeb61b453..4a40c30a79 100644 --- a/kernel/riscv64/zgemv_n_rvv.c +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index b682d5cd88..15795cc3a2 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -73,7 +73,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; size_t vlmax = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL(m); @@ -105,9 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) @@ -149,9 +148,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c 
b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index cf466d3fa8..79b20a6467 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VMSLT_VX_INT vmslt_vx_i32m2_b16 -#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VMSLT_VX_INT vmslt_vx_i64m2_b32 -#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #endif @@ -104,13 +104,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); - vb0 = 
VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index 6209f54172..a86815275e 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VMSLT_VX_INT vmslt_vx_i32m2_b16 -#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VMSLT_VX_INT vmslt_vx_i64m2_b32 -#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT 
__riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #endif @@ -101,13 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index 921ddb8cbd..5f7873b5a6 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -28,35 +28,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT vfabs_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT vfabs_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMVVF_FLOAT 
__riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif // TODO: Should single precision use the widening MAC, or perhaps all should be double? @@ -85,10 +85,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); vr = VFMACCVV_FLOAT(vr, v1, v1, vl); } @@ -103,16 +103,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); vr = VFMACCVV_FLOAT(vr, v1, v1, vl); } } - v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); scale = VFMVFS_FLOAT_M1(v_max); diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c index 68066a00b4..ee81bfe915 100644 --- a/kernel/riscv64/zrot_rvv.c +++ b/kernel/riscv64/zrot_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT 
__riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 079c36a2df..779fab68c3 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index 3928fbe276..44df112c6b 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -88,9 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); sumf += VFMVFS_FLOAT_M1(v_res); return(sumf); diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c index 86f9103d34..17b7b9f437 100644 --- a/kernel/riscv64/zswap_rvv.c +++ b/kernel/riscv64/zswap_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index df5c916a57..0f9e04869d 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT 
__riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) @@ -91,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index dcf2b081ae..fdc693700a 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT 
__riscv_vmerge_vvm_f64m2 #endif @@ -92,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index afd6944086..7276618c5b 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vint32m2_t -#define VID_V_UINT vid_v_i32m2 -#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT __riscv_vid_v_i32m2 +#define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -121,12 +121,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT 
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += 2; diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c index c7d5939495..72e8f2ce2a 100644 --- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -30,33 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -117,14 +117,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON //va1 = VLEV_FLOAT(ao, vl); VLSEG2_FLOAT(&va0, &va1, ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - //VSEV_FLOAT(b, vb, vl); VSSEG2_FLOAT(b, va0, 
va1, vl); ao += lda * 2; b += vl * 2; diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c index 3c70b63853..e6d36c86d6 100644 --- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -120,12 +120,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += 2; diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c index 706782cf0b..7085cfc379 
100644 --- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -32,33 +32,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -117,12 +117,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSEG2_FLOAT(&va0, &va1, ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += lda * 2; diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c index 3486a46480..399124d2e4 100644 --- a/kernel/riscv64/ztrmmkernel_2x2_rvv.c +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
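The other systematic change in these zsymm/ztrmm copy kernels is the operand order of the merge intrinsics: the mask moves from the first argument to the slot just before vl. A small sketch of the new form, under the assumption that result[i] = mask[i] ? scalar : vsrc[i], which is how the kernels use it to clear lanes outside the triangle:

```c
/* Argument-order change the copy kernels adapt to (sketch only):
 *   old:  vfmerge_vfm_f32m2(mask, vsrc, scalar, vl)
 *   new:  __riscv_vfmerge_vfm_f32m2(vsrc, scalar, mask, vl)
 * The vector-vector form vmerge_vvm moves its mask the same way. */
#include <riscv_vector.h>
#include <stddef.h>

static vfloat32m2_t zero_where_masked(vfloat32m2_t vsrc, vbool16_t mask, size_t vl)
{
    return __riscv_vfmerge_vfm_f32m2(vsrc, 0.0f, mask, vl);
}
```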
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEG4_FLOAT __riscv_vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEG4_FLOAT __riscv_vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif // Optimizes the implementation in ../generic/ztrmmkernel_2x2.c diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c index 27409ec25e..92b4b855bf 100644 --- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c index b7ccb1eb3e..383cb883fb 100644 --- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
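The triangular copy kernels above all rely on the same masking idea: vid generates lane indices, a compare against the current column builds a mask, and vfmerge writes zeros (and, under UNIT, ones on the diagonal) into the packed panel. A reduced, real-valued sketch of that pattern, not the kernels' exact code (they operate on interleaved complex pairs and also handle the diagonal case):

```c
/* Reduced sketch: zero the lanes of one packed column that fall outside the
 * triangle, using lane indices from vid compared against the column index j. */
#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

static void pack_lower_column(const float *a, float *b, size_t rows, size_t j)
{
    for (size_t i = 0, vl; i < rows; i += vl) {
        vl = __riscv_vsetvl_e32m2(rows - i);
        vfloat32m2_t va   = __riscv_vle32_v_f32m2(a + i, vl);
        vuint32m2_t  vidx = __riscv_vadd_vx_u32m2(__riscv_vid_v_u32m2(vl),
                                                  (uint32_t)i, vl);
        /* for a lower panel, rows with index < j lie outside the triangle */
        vbool16_t    cut  = __riscv_vmsltu_vx_u32m2_b16(vidx, (uint32_t)j, vl);
        va = __riscv_vfmerge_vfm_f32m2(va, 0.0f, cut, vl);
        __riscv_vse32_v_f32m2(b + i, va, vl);
    }
}
```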
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c index 911b81de58..f57e9f1dec 100644 --- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c index db075c29ba..be36134294 100644 --- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -30,25 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c index e121c62739..b1f5ef8f09 100644 --- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif From 6b74bee2f9d7272f1932a9ba9bbd1bda6c122fbf Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 27 Mar 2023 18:59:24 -0700 Subject: [PATCH 12/36] Update TARGET=x280 description. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ecb461786..1f1c0f3edb 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ``` (also known to work on C906) -- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. +- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0. ```sh make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran ``` From 18d7afe69daa196902cd68b63cc381aaafc9d26e Mon Sep 17 00:00:00 2001 From: sh-zheng <2294474733@qq.com> Date: Sat, 20 May 2023 01:19:44 +0800 Subject: [PATCH 13/36] Add rvv support for zsymv and active rvv support for zhemv --- kernel/riscv64/KERNEL.x280 | 17 ++- kernel/riscv64/zhemv_LM_rvv.c | 198 +++++++++++++++++++++++++++++++++ kernel/riscv64/zhemv_UV_rvv.c | 199 ++++++++++++++++++++++++++++++++++ kernel/riscv64/zsymv_L_rvv.c | 179 ++++++++++++++++++++++++++++++ kernel/riscv64/zsymv_U_rvv.c | 177 ++++++++++++++++++++++++++++++ 5 files changed, 766 insertions(+), 4 deletions(-) create mode 100644 kernel/riscv64/zhemv_LM_rvv.c create mode 100644 kernel/riscv64/zhemv_UV_rvv.c create mode 100644 kernel/riscv64/zsymv_L_rvv.c create mode 100644 kernel/riscv64/zsymv_U_rvv.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 217d8534e5..86708fe015 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -225,10 +225,19 @@ SSYMV_U_KERNEL = symv_U_rvv.c SSYMV_L_KERNEL = symv_L_rvv.c DSYMV_U_KERNEL = symv_U_rvv.c DSYMV_L_KERNEL = symv_L_rvv.c -CSYMV_U_KERNEL = ../generic/zsymv_k.c -CSYMV_L_KERNEL = ../generic/zsymv_k.c -ZSYMV_U_KERNEL = ../generic/zsymv_k.c -ZSYMV_L_KERNEL = ../generic/zsymv_k.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c new file mode 100644 index 0000000000..e025120e5e --- /dev/null +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + jx = 0; + jy = 0; + ja = 0; + for(j = 0; j < offset; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + ix = jx + inc_x2; + iy = jy + inc_y2; + ia = ja + 2; + i = j + 1; + len = m - i; + if(len > 0){ + gvl = VSETVL(len); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 
0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + if(i < m){ + gvl = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 += VFMVFS_FLOAT(v_res); + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c new file mode 100644 index 0000000000..0e1ea5436e --- /dev/null +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j 
< m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = VSETVL(j); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + if(i < j){ + gvl = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 += VFMVFS_FLOAT(v_res); + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c new file mode 100644 index 0000000000..3bf6210945 --- /dev/null +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -0,0 +1,179 @@ 
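The new zhemv/zsymv kernels load each column's real and imaginary parts as two strided streams and express the complex multiply-accumulate as vfmacc/vfnmsac pairs. A reduced sketch of the per-column y update for unit increments, with t_r/t_i standing for the scalars the kernels call temp_r1/temp_i1 (the kernels additionally handle arbitrary incx/incy, and the conjugated HEMVREV branches flip the signs on the imaginary contributions):

```c
/* Reduced sketch of the per-column update y += (t_r + i*t_i) * a, with the
 * interleaved complex column split into real/imaginary strided streams. */
#include <riscv_vector.h>
#include <stddef.h>

static void column_update_f32(float *y, const float *a, size_t len,
                              float t_r, float t_i)
{
    const ptrdiff_t stride = 2 * sizeof(float);   /* interleaved re/im pairs */
    for (size_t vl; len > 0; len -= vl, y += 2 * vl, a += 2 * vl) {
        vl = __riscv_vsetvl_e32m4(len);
        vfloat32m4_t a_r = __riscv_vlse32_v_f32m4(a,     stride, vl);
        vfloat32m4_t a_i = __riscv_vlse32_v_f32m4(a + 1, stride, vl);
        vfloat32m4_t y_r = __riscv_vlse32_v_f32m4(y,     stride, vl);
        vfloat32m4_t y_i = __riscv_vlse32_v_f32m4(y + 1, stride, vl);

        /* real part: t_r*a_r - t_i*a_i; imaginary part: t_r*a_i + t_i*a_r */
        y_r = __riscv_vfmacc_vf_f32m4(y_r, t_r, a_r, vl);
        y_r = __riscv_vfnmsac_vf_f32m4(y_r, t_i, a_i, vl);
        y_i = __riscv_vfmacc_vf_f32m4(y_i, t_r, a_i, vl);
        y_i = __riscv_vfmacc_vf_f32m4(y_i, t_i, a_r, vl);

        __riscv_vsse32_v_f32m4(y,     stride, y_r, vl);
        __riscv_vsse32_v_f32m4(y + 1, stride, y_i, vl);
    }
}
```

The dot-product half of each column (the kernels' temp_r2/temp_i2) follows the same split-stream layout, accumulating with the vfmacc_vv/vfnmsac_vv forms and reducing with vfredusum as in the sum sketch earlier.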
+/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + 
BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; + + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = VSETVL(len); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + + if(i < m){ + gvl = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] += VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] += VFMVFS_FLOAT_M1(v_res); + } + } + y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; + + jx += inc_x; + jy += inc_y; + a_ptr += 2 * lda; + } + + return(0); +} + diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c new file mode 100644 index 0000000000..de1564f758 --- /dev/null +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + for (j=m1; j 0){ 
+ ix = 0; + iy = 0; + i = 0; + gvl = VSETVL(j); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + + if(i < j){ + gvl = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] += VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] += VFMVFS_FLOAT_M1(v_res); + } + } + + y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0]; + + a_ptr += 2 * lda; + jx += inc_x; + jy += inc_y; + } + + return(0); +} + From d3bf5a5401e623e107a23fb70151c7102cbd14c7 Mon Sep 17 00:00:00 2001 From: sh-zheng <2294474733@qq.com> Date: Mon, 22 May 2023 22:39:45 +0800 Subject: [PATCH 14/36] Combine two reduction operations of zhe/symv into one, with tail undisturbed setted. --- kernel/riscv64/zhemv_LM_rvv.c | 68 +++++++++++++++++------------------ kernel/riscv64/zhemv_UV_rvv.c | 68 +++++++++++++++++------------------ kernel/riscv64/zsymv_L_rvv.c | 50 +++++++++++++------------- kernel/riscv64/zsymv_U_rvv.c | 52 +++++++++++++-------------- 4 files changed, 119 insertions(+), 119 deletions(-) diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c index e025120e5e..95c6a377ce 100644 --- a/kernel/riscv64/zhemv_LM_rvv.c +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
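/*
 * A minimal sketch (not part of the patch series) of the reduction pattern
 * PATCH 14 above applies to the zhemv/zsymv kernels, assuming <riscv_vector.h>
 * and the standard RVV intrinsics: every segment, including the short final
 * one, is folded into a single accumulator with tail-undisturbed (_tu)
 * multiply-adds, so one vfredusum at the end replaces the two partial
 * reductions.  dot_f32 and its arguments are illustrative names, not
 * OpenBLAS symbols.
 */
#include <riscv_vector.h>

static float dot_f32(const float *a, const float *x, size_t m)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t vr = __riscv_vfmv_v_f_f32m4(0.0f, vlmax);   /* zero once */

    size_t i = 0;
    while (i < m) {
        size_t vl = __riscv_vsetvl_e32m4(m - i);   /* shorter on the last pass */
        vfloat32m4_t va = __riscv_vle32_v_f32m4(&a[i], vl);
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(&x[i], vl);
        /* _tu: elements at and past vl keep their accumulated values */
        vr = __riscv_vfmacc_vv_f32m4_tu(vr, va, vx, vl);
        i += vl;
    }

    /* one reduction over the full register instead of one per segment */
    vfloat32m1_t vz = __riscv_vfmv_v_f_f32m1(0.0f, 1);
    vfloat32m1_t vs = __riscv_vfredusum_vs_f32m4_f32m1(vr, vz, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(vs);
}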
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m1() @@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -143,49 +147,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 = VFMVFS_FLOAT(v_res); + if(i < m){ - gvl = VSETVL(m-i); - va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); - va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); - vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); #ifndef HEMVREV - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #else - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #endif - VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); - VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); #ifndef HEMVREV - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, 
gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #else - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #endif - - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 += VFMVFS_FLOAT(v_res); } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c index 0e1ea5436e..ec06622fcc 100644 --- a/kernel/riscv64/zhemv_UV_rvv.c +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m1() @@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -142,49 +146,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 = VFMVFS_FLOAT(v_res); + if(i < j){ - gvl = VSETVL(j-i); - va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); - va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); - vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); #ifndef HEMVREV - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #else - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #endif - VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); - VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); #ifndef HEMVREV - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #else - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #endif - - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 
+= VFMVFS_FLOAT(v_res); } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); } y[jy] += temp_r1 * a_ptr[ja]; y[jy+1] += temp_i1 * a_ptr[ja]; diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c index 3bf6210945..cefdea7f6a 100644 --- a/kernel/riscv64/zsymv_L_rvv.c +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -57,6 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -133,38 +137,34 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] = VFMVFS_FLOAT_M1(v_res); if(i < m){ - gvl = VSETVL(m-i); - vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); - vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); - va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); - va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + unsigned int gvl_rem = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); - vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); - vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); - VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); - VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); - vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); - vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); - vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); - vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); - vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); - vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] += 
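/* Editorial note, not from the patch: the complex update that the
 * VFMACCVF/VFNMSACVF and VFMACCVV/VFNMSACVV pairs in the surrounding
 * zsymv hunks implement, with t1 = alpha * x[j] and a = A[i][j]
 * (no conjugation, since this is symv rather than hemv):
 *
 *   y_r[i] += t1_r*a_r - t1_i*a_i       y_i[i] += t1_r*a_i + t1_i*a_r
 *   t2_r   += x_r[i]*a_r - x_i[i]*a_i   t2_i   += x_r[i]*a_i + x_i[i]*a_r
 *
 * t2 is then scaled by alpha before being added into y[j].
 */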
VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] += VFMVFS_FLOAT_M1(v_res); } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); } y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c index de1564f758..67b5a649c1 100644 --- a/kernel/riscv64/zsymv_U_rvv.c +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -56,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -129,39 +133,35 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] = VFMVFS_FLOAT_M1(v_res); if(i < j){ - gvl = VSETVL(j-i); - vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); - vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); - va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); - va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); - vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); - vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); - VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); - VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); - vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); - vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); - vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); - vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); - vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); - vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, 
va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] += VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] += VFMVFS_FLOAT_M1(v_res); - } + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); } y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; From 0954746380a54c4f5c45f892dabef6b7c9aa93c3 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Sun, 4 Jun 2023 20:06:58 -0700 Subject: [PATCH 15/36] remove argument unused during compilation. fix wrong vr = VFMVVF_FLOAT(0, vl); --- Makefile.riscv64 | 2 +- kernel/riscv64/symv_L_rvv.c | 4 ---- kernel/riscv64/symv_U_rvv.c | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index d091984a6c..ce7a271412 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,7 +3,7 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_GENERIC) diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index e87ab22ae3..b27db2e373 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -94,7 +94,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = (m-i); k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLEV_FLOAT(&y[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -157,7 +155,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = (m-i); k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); inc_xv = inc_x * vl; va = VLEV_FLOAT(&a_ptr[i], vl); @@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vl = VSETVL(k); inc_xv = inc_x * vl; inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 3fbc33c893..7e45b1a018 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -95,7 +95,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = j; k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); vy = VLEV_FLOAT(&y[i], vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -158,7 +156,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT 
alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_xv = inc_x * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLEV_FLOAT(&y[i], vl); va = VLEV_FLOAT(&a_ptr[i], vl); @@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vl = VSETVL(k); inc_xv = inc_x * vl; inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); From e1958eb70529c36d7dc4f3baf9e7bf37524053ab Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Wed, 5 Jul 2023 11:34:00 +0100 Subject: [PATCH 16/36] Fixes RVV masked intrinsics for iamax/iamin/imax/imin kernels Changes masked intrinsics from _m to _mu and reintroduces maskedoff argument. --- kernel/riscv64/iamax_rvv.c | 16 ++++++++-------- kernel/riscv64/iamin_rvv.c | 16 ++++++++-------- kernel/riscv64/imax_rvv.c | 16 ++++++++-------- kernel/riscv64/imin_rvv.c | 16 ++++++++-------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index ef7850a55a..d3508a91d6 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index 56a086fed4..ae1d4f7269 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -46,9 +46,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
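/*
 * A minimal sketch (not part of the patch series) of the mask-policy fix in
 * PATCH 16 above and PATCH 17 below, assuming <riscv_vector.h>: the _mu
 * intrinsics take an explicit maskedoff operand and leave inactive lanes
 * equal to it, so the running index vector keeps its previous winners in
 * lanes where no new maximum was found; the plain _m forms leave those
 * lanes undefined.  update_max_indices, j (offset of the current chunk) and
 * the other parameters are illustrative names only.
 */
#include <riscv_vector.h>

static void update_max_indices(vfloat32m8_t v_max, vfloat32m8_t vx,
                               vuint32m8_t *v_max_index,
                               unsigned int j, size_t vl)
{
    /* lanes where the new element beats the current maximum */
    vbool4_t mask = __riscv_vmflt_vv_f32m8_b4(v_max, vx, vl);

    /* _mu: masked-off lanes keep *v_max_index instead of becoming undefined */
    vuint32m8_t vid = __riscv_vid_v_u32m8_mu(mask, *v_max_index, vl);
    *v_max_index    = __riscv_vadd_vx_u32m8_mu(mask, *v_max_index, vid, j, vl);
}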
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -72,9 +72,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index 5b60a56f79..33250568d6 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index b49544a1bb..4ce49c3afd 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); From 1e4a3a2b5e111a6a94eb53946fa92c1715c5dd5e Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Wed, 12 Jul 2023 12:55:50 +0100 Subject: [PATCH 17/36] Fixes RVV masked intrinsics for izamax/izamin kernels --- kernel/riscv64/izamax_rvv.c | 18 +++++++++--------- kernel/riscv64/izamin_rvv.c | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e61d0cbec1..e43ded8202 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -77,9 +77,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); @@ -138,9 +138,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); - + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); } diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index 297b3c99a3..cc3c37c8e7 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -47,9 +47,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -74,9 +74,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); From 8df0289db61ea5a3e461c94c51a5798e2dd18b86 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Thu, 20 Jul 2023 15:28:35 +0100 Subject: [PATCH 18/36] Adds tail undisturbed for RVV Level 1 operations During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic. Commit changes intrinsics tail policy to undistrubed. --- kernel/riscv64/amax_rvv.c | 8 +++--- kernel/riscv64/amin_rvv.c | 8 +++--- kernel/riscv64/asum_rvv.c | 8 +++--- kernel/riscv64/dot_rvv.c | 16 +++++------ kernel/riscv64/iamax_rvv.c | 24 ++++++++-------- kernel/riscv64/iamin_rvv.c | 24 ++++++++-------- kernel/riscv64/imax_rvv.c | 24 ++++++++-------- kernel/riscv64/imin_rvv.c | 24 ++++++++-------- kernel/riscv64/izamax_rvv.c | 24 ++++++++-------- kernel/riscv64/izamin_rvv.c | 24 ++++++++-------- kernel/riscv64/max_rvv.c | 8 +++--- kernel/riscv64/min_rvv.c | 8 +++--- kernel/riscv64/nrm2_rvv.c | 8 +++--- kernel/riscv64/sum_rvv.c | 8 +++--- kernel/riscv64/zamax_rvv.c | 8 +++--- kernel/riscv64/zamin_rvv.c | 8 +++--- kernel/riscv64/zasum_rvv.c | 12 ++++---- kernel/riscv64/zdot_rvv.c | 56 ++++++++++++++++++------------------- kernel/riscv64/znrm2_rvv.c | 24 ++++++++-------- kernel/riscv64/zsum_rvv.c | 12 ++++---- 20 files changed, 168 insertions(+), 168 deletions(-) diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c index be0bdbea0c..451fbc834e 100644 --- a/kernel/riscv64/amax_rvv.c +++ b/kernel/riscv64/amax_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
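/*
 * A minimal sketch (not part of the patch series) of the hazard PATCH 18
 * above removes, assuming <riscv_vector.h>: with a tail-agnostic max, the
 * short last iteration (vl < vlmax) may overwrite accumulator elements at
 * and past vl, losing values gathered earlier; the _tu form leaves those
 * elements untouched, so the final full-width reduction stays correct.
 * amax_f32 is an illustrative name, not the OpenBLAS kernel itself.
 */
#include <riscv_vector.h>

static float amax_f32(const float *x, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(0.0f, vlmax);

    size_t i = 0;
    while (i < n) {
        size_t vl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t vx = __riscv_vle32_v_f32m8(&x[i], vl);
        vx = __riscv_vfabs_v_f32m8(vx, vl);
        /* tail-undisturbed: elements [vl, vlmax) of vmax are preserved */
        vmax = __riscv_vfmax_vv_f32m8_tu(vmax, vmax, vx, vl);
        i += vl;
    }

    vfloat32m1_t v0 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
    vfloat32m1_t vr = __riscv_vfredmax_vs_f32m8_f32m1(vmax, v0, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(vr);
}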
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } else { @@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c index d4926084b7..5186d7b128 100644 --- a/kernel/riscv64/amin_rvv.c +++ b/kernel/riscv64/amin_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } else { @@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c index 691591e22b..0ea610cbb3 100644 --- a/kernel/riscv64/asum_rvv.c +++ b/kernel/riscv64/asum_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -76,7 +76,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } else { @@ -88,7 +88,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c index 3276695b63..837badf411 100644 --- a/kernel/riscv64/dot_rvv.c +++ b/kernel/riscv64/dot_rvv.c @@ -49,12 +49,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } @@ -69,12 +69,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } else if (1 == inc_y) { @@ -88,12 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } else { @@ -108,12 +108,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index d3508a91d6..8362d7cefd 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -68,12 +68,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -106,11 +106,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } else { @@ -125,11 +125,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index ae1d4f7269..f90dbb5450 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -43,12 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -69,12 +69,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -107,11 +107,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } else { @@ -126,11 +126,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index 33250568d6..b1a77b1783 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } else { @@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index 4ce49c3afd..1de7f3233b 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } else { @@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e43ded8202..e93f0056cc 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -44,13 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -73,13 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -116,11 +116,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); } } else { @@ -138,11 +138,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); } } diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index cc3c37c8e7..b5bc27404d 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -43,13 +43,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,13 +70,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -113,11 +113,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); } } else { @@ -136,11 +136,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); } } diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c index 9315321f4c..745c27bf4c 100644 --- a/kernel/riscv64/max_rvv.c +++ b/kernel/riscv64/max_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } else { @@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c index 158b682fd1..78528fef99 100644 --- a/kernel/riscv64/min_rvv.c +++ b/kernel/riscv64/min_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } else { @@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 42abfa1196..994fadb702 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -79,7 +79,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLEV_FLOAT(x, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); } } else { @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(x, stride_x, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); } } diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c index 9715faf224..c5629197fb 100644 --- a/kernel/riscv64/sum_rvv.c +++ b/kernel/riscv64/sum_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -73,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } else { @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index 615b7519c3..bbb1e876b8 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmax = VFMAXVV_FLOAT(vmax, v0, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); } @@ -101,7 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmax = VFMAXVV_FLOAT(vmax, v0, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); } } diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index a0d36d46f9..c5453121b6 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmin = VFMINVV_FLOAT(vmin, v0, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); } } else { @@ -100,7 +100,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmin = VFMINVV_FLOAT(vmin, v0, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); } } diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c index 1d2f0e1fe0..ebec1b19c8 100644 --- a/kernel/riscv64/zasum_rvv.c +++ b/kernel/riscv64/zasum_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -51,7 +51,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #endif @@ -75,8 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } @@ -93,8 +93,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index 1543c513d0..fa0e89353e 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -36,12 +36,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) @@ -52,12 +52,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -86,14 +86,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSEG_FLOAT(&vx0, &vx1, x, vl); VLSEG_FLOAT(&vy0, &vy1, y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } @@ -107,14 +107,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSEG_FLOAT(&vx0, &vx1, x, vl); VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } } else if (inc_y == 1){ @@ -127,14 +127,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); VLSEG_FLOAT(&vy0, &vy1, y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } }else { @@ -148,14 +148,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = 
VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } } diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index 5f7873b5a6..d2b27aa8d3 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else @@ -51,10 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif @@ -85,11 +85,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); } } else { @@ -103,11 +103,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); } } diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index 44df112c6b..b41f70eb53 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m4() @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m4_tu #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -69,8 +69,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) VLSEG_FLOAT(&v0, &v1, x, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } else { @@ -82,8 +82,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } From 826a9d5fa47f20f23f42c97385e72e121a2efb4f Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Tue, 25 Jul 2023 11:36:23 +0100 Subject: [PATCH 19/36] Adds tail undisturbed for RVV Level 2 operations During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic. Commit changes intrinsics tail policy to undisturbed. --- kernel/riscv64/gemv_t_rvv.c | 8 ++++---- kernel/riscv64/symv_L_rvv.c | 12 +++++------ kernel/riscv64/symv_U_rvv.c | 12 +++++------ kernel/riscv64/zgemv_t_rvv.c | 40 ++++++++++++++++++------------------ 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c index f0c8348669..9c859aa509 100644 --- a/kernel/riscv64/gemv_t_rvv.c +++ b/kernel/riscv64/gemv_t_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO va = VLEV_FLOAT(a_ptr, vl); vx = VLEV_FLOAT(x_ptr, vl); - vr = VFMACCVV_FLOAT(vr, va, vx, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); @@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO va = VLEV_FLOAT(a_ptr, vl); vx = VLSEV_FLOAT(x_ptr, stride_x, vl); - vr = VFMACCVV_FLOAT(vr, va, vx, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index b27db2e373..888d628a50 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSEV_FLOAT __riscv_vse32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); iy += inc_yv; } @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; } @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; iy += inc_yv; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 7e45b1a018..3cfd3ee4c0 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT __riscv_vse64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); iy += inc_yv; } @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; } @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; iy += inc_yv; } diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index 15795cc3a2..2f03805305 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 @@ -90,15 +90,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); #endif j += vl * 2; ix += vl * inc_x * 2; @@ -134,15 +134,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); #endif j += vl * 2; ix += vl * inc_x * 2; From e4586e81b896b85b600c50f9670e59989cbdabf7 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Mon, 4 Dec 2023 11:02:18 +0000 Subject: [PATCH 20/36] [RISC-V] Add RISC-V Vector 128-bit target Current RVV x280 target depends on vlen=512-bits for Level 3 operations. Commit adds generic target that supports vlen=128-bits. New target uses the same scalable kernels as x280 for Level 1&2 operations, and autogenerated kernels for Level 3 operations. Functional correctness of Level 3 operations tested on vlen=128-bits using QEMU v8.1.1 for ctests and BLAS-Tester. 
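Taken together, the last two commits describe one pattern: the scalable Level 1/2 kernels keep a VLMAX-wide accumulator and request the active vector length from vsetvl on every trip, so the final, shorter iteration must use the tail-undisturbed (_tu) intrinsic forms or the inactive lanes of the accumulator can be clobbered. The sketch below is illustrative only and is not taken from any file in this series; the function name dot_f32 and the e32/m8 type choice are arbitrary, and only standard riscv_vector.h intrinsics are used.

#include <riscv_vector.h>
#include <stddef.h>

/* Illustrative sketch (not from the patch): a dot product in the style of
   gemv_t_rvv.c, showing the tail-undisturbed accumulator pattern. */
static float dot_f32(const float *a, const float *x, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t vr = __riscv_vfmv_v_f_f32m8(0.0f, vlmax);   /* VLMAX-wide accumulator */

    for (size_t i = 0; i < n; ) {
        size_t vl = __riscv_vsetvl_e32m8(n - i);              /* vl < vlmax on the last trip */
        vfloat32m8_t va = __riscv_vle32_v_f32m8(&a[i], vl);
        vfloat32m8_t vx = __riscv_vle32_v_f32m8(&x[i], vl);
        /* _tu: lanes vl..vlmax-1 of vr keep their previous partial sums; the
           tail-agnostic form may overwrite them on the short final iteration. */
        vr = __riscv_vfmacc_vv_f32m8_tu(vr, va, vx, vl);
        i += vl;
    }

    /* Reduce the full VLMAX width, so the result is the same whether VLEN is
       128 bits (RISCV64_ZVL128B) or 512 bits (x280). */
    vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1(0.0f, 1);
    vfloat32m1_t vsum  = __riscv_vfredusum_vs_f32m8_f32m1(vr, vzero, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(vsum);
}

Because vl is re-derived from vsetvl on each iteration and the final reduction runs over the full VLMAX, the same source serves both the 512-bit x280 and the new 128-bit ZVL128B configuration; only the Level 3 kernels added below are generated per vector length.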
--- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + TargetList.txt | 1 + cpuid_riscv64.c | 4 +- getarch.c | 13 +- kernel/riscv64/KERNEL.RISCV64_ZVL128B | 243 +++++ kernel/riscv64/cgemm_kernel_8x4_zvl128b.c | 996 +++++++++++++++++++ kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c | 1102 +++++++++++++++++++++ kernel/riscv64/dgemm_kernel_8x4_zvl128b.c | 492 +++++++++ kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c | 660 ++++++++++++ kernel/riscv64/sgemm_kernel_8x8_zvl128b.c | 791 +++++++++++++++ kernel/riscv64/strmm_kernel_8x8_zvl128b.c | 991 ++++++++++++++++++ kernel/riscv64/zgemm_kernel_4x4_zvl128b.c | 720 ++++++++++++++ kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c | 805 +++++++++++++++ param.h | 39 + 15 files changed, 6863 insertions(+), 2 deletions(-) create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL128B create mode 100644 kernel/riscv64/cgemm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/dgemm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/sgemm_kernel_8x8_zvl128b.c create mode 100644 kernel/riscv64/strmm_kernel_8x8_zvl128b.c create mode 100644 kernel/riscv64/zgemm_kernel_4x4_zvl128b.c create mode 100644 kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c diff --git a/Makefile.prebuild b/Makefile.prebuild index c4f4a26026..b56169da07 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -59,6 +59,10 @@ ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif +ifeq ($(TARGET), RISCV64_ZVL128B) +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d +endif + ifeq ($(TARGET), RISCV64_GENERIC) TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d endif diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce7a271412..93e270bde7 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -6,6 +6,10 @@ ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif +ifeq ($(CORE), RISCV64_ZVL128B) +CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static diff --git a/TargetList.txt b/TargetList.txt index f76f605cc3..5b7a63831e 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -119,6 +119,7 @@ Z14 10.RISC-V 64: RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) +RISCV64_ZVL128B C910V x280 diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 1b6b62f212..15a539c20e 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 #define CPU_RISCV64_ZVL256B 2 +#define CPU_RISCV64_ZVL128B 3 static char *cpuname[] = { "RISCV64_GENERIC", "C910V", - "CPU_RISCV64_ZVL256B" + "CPU_RISCV64_ZVL256B", + "CPU_RISCV64_ZVL128B" }; int detect(void){ diff --git a/getarch.c b/getarch.c index 7728363472..b8b7ef7e0a 100644 --- a/getarch.c +++ b/getarch.c @@ -1691,7 +1691,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "x280" #else #endif - +#ifdef FORCE_RISCV64_ZVL128B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL128B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL128B " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl128b" +#define CORENAME "RISCV64_ZVL128B" +#endif #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B new file mode 100644 index 0000000000..fec69ee094 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -0,0 +1,243 @@ +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c 
+DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c +SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c + +DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c +DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c + +CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c +CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + +ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c +ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + +CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c +CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + +ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c +ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c new file mode 100644 index 0000000000..bd615389c8 --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,996 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='cgemm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 
0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + 
vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = 
tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = 
__riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; 
+ Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; 
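+ // N & 2 tail: same scheme as the main pass but with only two columns of B live per iteration, so four accumulators (ACC0/ACC1, real and imaginary) suffice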
+ float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector 
regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 
+ 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = 
__riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + 
m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c new file mode 100644 index 0000000000..3268cb810f --- /dev/null +++ b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,1102 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='ctrmm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + 
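+ // each 8x4 tile first computes off/pass_K from the offset argument (see below), trimming the K loop to the portion this tile actually contributes -- the usual TRMM offset handling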
for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = 
__riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = 
VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; 
+ float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float 
result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = 
B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = 
VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + 
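+ // A and B were advanced past the first off K-steps above, so shorten the loop to match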
pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = 
B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi 
+= 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c new file mode 100644 index 0000000000..a613f0bceb --- /dev/null +++ b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,492 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dgemm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + 
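+ // M&4 tail of the main pass: the same vectorized four-column update as the
+ // 8-row block above, but the requested vector length drops to 4 so a single
+ // vle64/vse64 per column covers the remaining rows.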
gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + 
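+ // N&2 tail: the two remaining columns of B are held in scalars (B0, B1) and
+ // broadcast against the 8-element A vector each k step; only two accumulators
+ // and two columns of C are updated per block.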
BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = 
B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c new file mode 100644 index 0000000000..c1e0da86e1 --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,660 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dtrmm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS 
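+ // BACKWARDS: skip the first `off` K iterations of this block by advancing
+ // the A and B panels and shrinking pass_K; in the non-BACKWARDS case below,
+ // pass_K is instead limited to `off` plus the block dimension.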
+ ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + 
double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + 
bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = 
__riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c new file mode 100644 index 0000000000..ad720e6949 --- /dev/null +++ b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c @@ -0,0 +1,791 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='sgemm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + 
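+ // Main 8x8 pass: A is packed so each k step loads 8 consecutive floats with a
+ // single unit-stride vle32, the 8 B values of the current column panel are
+ // broadcast from scalars into 8 vfmacc accumulators, and C is then loaded,
+ // updated with alpha times the accumulators, and stored one column at a time.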
for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float 
B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 
+= A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + C[ci + 4 * ldc + 0] += alpha * result8; + C[ci + 4 * ldc + 1] += alpha * result9; + C[ci + 5 * ldc + 0] += alpha * result10; + C[ci + 5 * ldc + 1] += alpha * result11; + C[ci + 6 * ldc + 0] += alpha * result12; + C[ci + 6 * ldc + 1] += alpha * result13; + C[ci + 7 * ldc + 0] += alpha * result14; + C[ci + 7 * ldc + 1] += alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + C[ci + 4 * ldc + 0] += alpha * result4; + C[ci + 5 * ldc + 0] += alpha * result5; + C[ci + 6 * ldc + 0] += alpha * result6; + C[ci + 7 * ldc + 0] += alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = 
__riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = 
m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai 
+ 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/strmm_kernel_8x8_zvl128b.c b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c new file mode 100644 index 0000000000..ef18f036c0 --- /dev/null +++ b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c @@ -0,0 +1,991 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='strmm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + 
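+// trmm variant of the sgemm 8x8 kernel above: C is overwritten rather than
+// accumulated, and the K range of each block is trimmed via `offset`. When
+// LEFT and TRANSA differ (BACKWARDS), the leading `off` iterations are
+// skipped; otherwise pass_K is limited to `off` plus the current block size.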
+#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; 
+#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * 
B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + C[ci + 4 * ldc + 0] = alpha * result8; + C[ci + 4 * ldc + 1] = alpha * result9; + C[ci + 5 * ldc + 0] = alpha * result10; + C[ci + 5 * ldc + 1] = alpha * result11; + C[ci + 6 * ldc + 0] = alpha * result12; + C[ci + 6 * ldc + 1] = alpha * result13; + C[ci + 7 * ldc + 0] = alpha * result14; + C[ci + 7 * ldc + 1] = alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + C[ci + 4 * ldc + 0] = alpha * result4; + C[ci + 5 * ldc + 0] = alpha * result5; + C[ci + 6 * ldc + 0] = alpha * result6; + C[ci + 7 * ldc + 0] = alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = 
B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai 
+= 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + 
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, 
gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c new file mode 100644 index 0000000000..0776f03fdd --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c @@ -0,0 +1,720 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='zgemm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < 
N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, 
gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi 
+ 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + 
result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 
2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * 
A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * 
alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c new file mode 100644 index 0000000000..d7d5e5feab --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c @@ -0,0 +1,805 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='ztrmm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset 
+ m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = 
__riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi 
+ 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; 
+ Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + 
double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef 
BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < 
pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/param.h b/param.h index c5c70b78e3..a1a70400c5 100644 --- a/param.h +++ b/param.h @@ -3123,6 +3123,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef RISCV64_ZVL128B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef RISCV64_ZVL256B #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 From 4a12cf53ec116c06e5d74073b54a3bca6046cb17 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Mon, 4 Dec 2023 11:13:35 +0000 Subject: [PATCH 21/36] [RISC-V] Improve RVV kernel generator LMUL usage The RVV kernel generation script uses the provided LMUL to increase the number of accumulator registers. Since the effect of the LMUL is to group together the vector registers into larger ones, it actually should be used as a multiplier in the calculation of vlenmax. At the moment, no matter what LMUL is provided, the generated kernels would only set the maximum number of vector elements equal to VLEN/SEW. Commit changes the use of LMUL to properly adjust vlenmax. Note that an increase in LMUL results in a decrease in the number of effective vector registers. 
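
As a rough illustration of the register budgeting this change implies, here is a minimal sketch in the generator's language (Python). The function below is hypothetical: it only mirrors the formulas visible in the diff (vlenmax, a_regs, accumulation_regs, the 32 // LMUL budget), not the script's actual code, and the example inputs are assumed values.

def register_budget(reg_width_bits, elen, lmul_acc, M, N, is_complex=False):
    # LMUL groups vector registers, so it multiplies the element capacity of each group
    vlenmax = (reg_width_bits * lmul_acc) // elen
    a_regs = max(M // vlenmax, 1)
    # one accumulator group per A group per column of the N-wide micro-tile
    accumulation_regs = a_regs * N
    required = accumulation_regs + a_regs
    if is_complex:
        required = required * 2 + 2
    # grouping also shrinks the number of addressable register groups
    available = 32 // lmul_acc
    return vlenmax, required, available

# Example (assumed values): reg_width_bits=256, elen=64, M=N=8
#   lmul_acc=2 -> vlenmax=8,  required=9, available=16  (fits)
#   lmul_acc=4 -> vlenmax=16, required=9, available=8   (would be rejected)
print(register_budget(256, 64, 2, 8, 8))
print(register_budget(256, 64, 4, 8, 8))

The numbers are only meant to show why vlenmax must scale with LMUL while the register budget shrinks with it, as described above.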
--- kernel/riscv64/generate_kernel.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py index e2ce97971a..8be7c9f9cc 100755 --- a/kernel/riscv64/generate_kernel.py +++ b/kernel/riscv64/generate_kernel.py @@ -197,13 +197,13 @@ def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): dest.write("ai += {M}*2;") dest.write() - - accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 ) pass_regs = (accumulation_regs + a_regs)*2 - tmp_regs = 32-pass_regs + tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs if tmp_regs < 2: raise RuntimeError("Complex kernel would use too many registers!") @@ -337,10 +337,12 @@ def generate_gemm_kernel( settings, OUTPUT ): M = settings['M'].value N = settings['N'].value - vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value / + settings['ELEN_PARAM'].value) a_regs = max(int(M/vlenmax), 1) - accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N required_regs = accumulation_regs + a_regs if is_complex: required_regs = required_regs * 2 + 2 @@ -380,9 +382,9 @@ def generate_gemm_kernel( settings, OUTPUT ): '''.format(tail_policy=settings['tail_policy'].value)) - if required_regs > 32: - raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format( - required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '') + if required_regs > (32 // settings['LMUL_ACC'].value): + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value )) TRMM = (settings['op'].value == 'trmm') @@ -448,7 +450,8 @@ def generate_gemm_kernel( settings, OUTPUT ): def generate_M_tails( dest, settings, M, N ): M_tail = int(M/2) M_tail_min = settings['M_tail_scalar_from'].value - vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value + / settings['ELEN_PARAM'].value ) TRMM = (settings['op'].value == 'trmm') is_complex = settings['complex'].value generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real @@ -667,4 +670,4 @@ def OUTPUT(*args, **kwargs): ERROR("unsupported kernel type {}".format(settings['op'])) if __name__ == "__main__": - main() \ No newline at end of file + main() From 9edb805e645d3530e907864e242a3f64a881b28a Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Tue, 16 Jan 2024 14:24:18 +0000 Subject: [PATCH 22/36] fix builds with t-head toolchains that use old versions of the intrinsics spec --- common_riscv64.h | 17 +++++- cpuid_riscv64.c | 4 +- kernel/riscv64/amax_vector.c | 18 +++--- kernel/riscv64/amin_vector.c | 18 
+++--- kernel/riscv64/asum_vector.c | 20 ++++--- kernel/riscv64/axpby_vector.c | 16 +++--- kernel/riscv64/axpy_vector.c | 12 ++-- kernel/riscv64/copy_vector.c | 14 ++--- kernel/riscv64/dot_vector.c | 44 +++++++++------ kernel/riscv64/gemv_n_vector.c | 24 ++++---- kernel/riscv64/gemv_t_vector.c | 42 ++++++++------ kernel/riscv64/iamax_vector.c | 80 +++++++++++++++------------ kernel/riscv64/iamin_vector.c | 82 ++++++++++++++++------------ kernel/riscv64/imax_vector.c | 82 ++++++++++++++++------------ kernel/riscv64/imin_vector.c | 80 ++++++++++++++++----------- kernel/riscv64/izamax_vector.c | 94 ++++++++++++++++++-------------- kernel/riscv64/izamin_vector.c | 92 ++++++++++++++++++------------- kernel/riscv64/max_vector.c | 20 ++++--- kernel/riscv64/min_vector.c | 20 ++++--- kernel/riscv64/nrm2_vector.c | 64 ++++++++++++---------- kernel/riscv64/rot_vector.c | 36 ++++++------ kernel/riscv64/scal_vector.c | 14 ++--- kernel/riscv64/sum_vector.c | 32 +++++------ kernel/riscv64/swap_vector.c | 10 ++-- kernel/riscv64/symv_L_vector.c | 56 +++++++++++-------- kernel/riscv64/symv_U_vector.c | 60 +++++++++++--------- kernel/riscv64/zamax_vector.c | 25 +++++---- kernel/riscv64/zamin_vector.c | 25 +++++---- kernel/riscv64/zasum_vector.c | 22 +++++--- kernel/riscv64/zaxpby_vector.c | 32 +++++------ kernel/riscv64/zaxpy_vector.c | 20 +++---- kernel/riscv64/zcopy_vector.c | 12 ++-- kernel/riscv64/zdot_vector.c | 60 +++++++++++--------- kernel/riscv64/zgemv_n_vector.c | 28 +++++----- kernel/riscv64/zgemv_t_vector.c | 56 +++++++++++-------- kernel/riscv64/zhemv_LM_vector.c | 60 +++++++++++--------- kernel/riscv64/zhemv_UV_vector.c | 60 +++++++++++--------- kernel/riscv64/znrm2_vector.c | 51 +++++++++-------- kernel/riscv64/zrot_vector.c | 36 ++++++------ kernel/riscv64/zscal_vector.c | 32 +++++------ kernel/riscv64/zsum_vector.c | 16 +++--- kernel/riscv64/zswap_vector.c | 10 ++-- 42 files changed, 900 insertions(+), 696 deletions(-) diff --git a/common_riscv64.h b/common_riscv64.h index de79c8cabb..f11e8b75d4 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,12 +91,23 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) # include +#endif + +#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) +// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this +#define RISCV_0p10_INTRINSICS +#define RISCV_RVV(x) x +#else +#define RISCV_RVV(x) __riscv_ ## x +#endif + +#if defined(C910V) || defined(RISCV64_ZVL256B) # if !defined(DOUBLE) -# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) # else -# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) # endif #else # define EXTRACT_FLOAT(v) (v[0]) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 1b6b62f212..928b5ba923 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_GENERIC 0 #define CPU_C910V 1 -#define CPU_RISCV64_ZVL256B 2 +#define CPU_x280 2 +#define CPU_RISCV64_ZVL256B 3 static char *cpuname[] = { "RISCV64_GENERIC", "C910V", + "x280", "CPU_RISCV64_ZVL256B" }; diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 81a39af329..b66d4871e9 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -49,15 +49,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index c8ba75f4a5..1c541f0fd1 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -48,15 +48,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index d10bf99e6b..995dbf9a13 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -49,16 +49,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index b77cb58fb3..386c4a5f1c 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -48,15 +48,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) -#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMULVF_FLOAT JOIN(__riscv_vfmul, _vf_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 3447107a64..e99ca85420 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -49,13 +49,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) -#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index 710e8670a4..ccbd6e482b 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -47,12 +47,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/4){ BLASLONG inc_xv = inc_x * gvl; - BLASLONG gvl3 = gvl * 3; + unsigned int gvl3 = gvl * 3; BLASLONG inc_xv3 = inc_xv * 3; for(i=0,j=0; i #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m8 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) #else -#define VSETVL(n) __riscv_vsetvl_e64m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m8 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index baf3d8f699..3b467a5868 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index f3b9221959..cd89c63ec7 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -27,35 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define 
VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 9977e27418..894c6a6433 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define 
VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 4301528bd5..2dee5ab29a 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -53,19 +53,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) +#endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) -#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 095b1c3dfc..df9a7a7e13 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -55,19 +55,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) +#endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) -#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 9136f00378..fca904d6ae 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -53,17 +53,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 404f51fb32..d5ad974cf6 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 20bfe74ec2..d19e511187 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 9da60acb0e..9e4a67b710 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #endif diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index 57542714a1..13b8fe378b 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) 
+#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index f4acad7704..104d3865d2 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index 1794540947..5d85ab3a48 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl) +#else +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -93,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl); - v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); j += inc_av; ix += inc_xv; @@ -117,8 +125,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl); - v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); } temp_r = VFMVFS_FLOAT(v_res_r); diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 
e025120e5e..117db7d840 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 0e1ea5436e..7c6b63bf30 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ 
b/kernel/riscv64/zhemv_UV_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index 437bf4246e..8614f75391 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -52,37 +52,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VFABS JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VMFNE JOIN(__riscv_vmfne_vf_f,ELEN, LMUL, _b, MLEN) -#define VMFGT JOIN(__riscv_vmfgt_vv_f,ELEN, LMUL, _b, MLEN) -#define VMFEQ JOIN(__riscv_vmfeq_vv_f,ELEN, LMUL, _b, MLEN) -#define VCPOP JOIN(__riscv_vcpop, _m_b, MLEN, _, _) -#define VFREDMAX JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) -#define VFIRST JOIN(__riscv_vfirst, _m_b, MLEN, _, _) -#define VRGATHER JOIN(__riscv_vrgather, _vx_f, ELEN, LMUL, _) -#define VFDIV JOIN(__riscv_vfdiv, _vf_f, ELEN, LMUL, _) -#define VFDIV_M JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _mu) -#define VFMUL JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _) -#define VFMACC JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _) -#define VFMACC_M JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _mu) -#define VMSOF JOIN(__riscv_vmsof, _m_b, MLEN, _, _) -#define VMANDN JOIN(__riscv_vmandn, _mm_b, MLEN, _, _) -#define VFREDUSUM JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFABS JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VMFNE JOIN(RISCV_RVV(vmfne_vf_f),ELEN, LMUL, _b, MLEN) +#define VMFGT JOIN(RISCV_RVV(vmfgt_vv_f),ELEN, LMUL, _b, MLEN) +#define VMFEQ JOIN(RISCV_RVV(vmfeq_vv_f),ELEN, LMUL, _b, MLEN) +#define VCPOP JOIN(RISCV_RVV(vcpop), _m_b, MLEN, _, _) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAX(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl) +#define VFREDUSUM(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl) +#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _m) +#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _m) +#else +#define VFREDMAX JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFREDUSUM JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _mu) +#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _mu) +#endif +#define VFIRST JOIN(RISCV_RVV(vfirst), _m_b, MLEN, _, _) +#define VRGATHER JOIN(RISCV_RVV(vrgather), _vx_f, ELEN, LMUL, _) +#define VFDIV JOIN(RISCV_RVV(vfdiv), _vf_f, ELEN, LMUL, _) +#define VFMUL JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _) +#define VFMACC JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _) +#define VMSOF JOIN(RISCV_RVV(vmsof), _m_b, MLEN, _, _) +#define VMANDN JOIN(RISCV_RVV(vmandn), _mm_b, MLEN, _, _) #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif -#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v) +#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v) FLOAT 
CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index c3afbc7cc6..50751b3438 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 5d9ab7b288..2034aafaae 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c index 7aab151051..ca0b02b5c2 100644 --- a/kernel/riscv64/zsum_vector.c +++ b/kernel/riscv64/zsum_vector.c @@ -53,16 +53,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index d8980602d7..02c98b5888 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { From b193ea3d7b6ae9950d6c7c73937661318a719c26 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 18 Jan 2024 22:11:12 +0300 Subject: [PATCH 23/36] Fix BLAS and LAPACK tests for RVV 1.0 target, update to 0.12.0 intrinsics * Update intrinsics API to 0.12.0 version (Stride Segment Loads/Stores) * Fixed nrm2, axpby, ncopy, zgemv and scal kernels * Added zero-size checks --- kernel/riscv64/axpby_rvv.c | 8 +- kernel/riscv64/copy_rvv.c | 2 +- kernel/riscv64/gemm_ncopy_8_rvv.c | 51 ++++- kernel/riscv64/gemm_tcopy_8_rvv.c | 71 +++--- kernel/riscv64/izamax_rvv.c | 23 +- kernel/riscv64/izamin_rvv.c | 23 +- kernel/riscv64/nrm2_rvv.c | 237 +++++++++++++------ kernel/riscv64/scal_rvv.c | 49 ++-- kernel/riscv64/symv_U_rvv.c | 2 +- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 39 ++-- kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 39 ++-- kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 37 +-- kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 33 ++- kernel/riscv64/zamax_rvv.c | 23 +-
kernel/riscv64/zamin_rvv.c | 23 +- kernel/riscv64/zaxpby_rvv.c | 63 ++++-- kernel/riscv64/zaxpy_rvv.c | 76 +++++-- kernel/riscv64/zcopy_rvv.c | 40 ++-- kernel/riscv64/zdot_rvv.c | 49 +++- kernel/riscv64/zgemm_beta_rvv.c | 27 ++- kernel/riscv64/zgemm_ncopy_4_rvv.c | 72 ++++-- kernel/riscv64/zgemm_ncopy_rvv_v1.c | 18 +- kernel/riscv64/zgemm_tcopy_4_rvv.c | 60 +++-- kernel/riscv64/zgemm_tcopy_rvv_v1.c | 18 +- kernel/riscv64/zgemmkernel_rvv_v1x4.c | 144 +++++++++--- kernel/riscv64/zgemv_n_rvv.c | 50 ++-- kernel/riscv64/zgemv_t_rvv.c | 44 ++-- kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 33 ++- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 33 ++- kernel/riscv64/znrm2_rvv.c | 301 +++++++++++++++++++------ kernel/riscv64/zrot_rvv.c | 95 ++++++-- kernel/riscv64/zscal_rvv.c | 66 ++++-- kernel/riscv64/zsum_rvv.c | 23 +- kernel/riscv64/zswap_rvv.c | 62 ++--- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 33 ++- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 33 ++- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 31 ++- kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 110 ++++++--- kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 26 +-- param.h | 2 +- 46 files changed, 1628 insertions(+), 709 deletions(-) diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index a1dbdb0e42..d7fb86eab6 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * { FLOAT_V_T vx, vy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( beta == 0.0 ) { if ( alpha == 0.0 ) { @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_y = inc_y * sizeof(FLOAT); size_t vl = VSETVL(n); vy = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) { + for ( ; n > 0; n -= vl, y += vl*inc_y) { vl = VSETVL(n); VSSEV_FLOAT(y, stride_y, vy, vl); } @@ -126,10 +126,12 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } else { if ((1 == inc_x) && (1 == inc_y)) { - for (size_t vl; n > 0; n -= vl, y += vl) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); vy = VLEV_FLOAT(y, vl); vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); VSEV_FLOAT (y, vy, vl); } } else if (1 == inc_x) { diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c index 041fd2daeb..9d4b840952 100644 --- a/kernel/riscv64/copy_rvv.c +++ b/kernel/riscv64/copy_rvv.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); FLOAT_V_T v0; diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c index 3030d67fbc..c652ab0c00 100644 --- a/kernel/riscv64/gemm_ncopy_8_rvv.c +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -30,19 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/gemm_ncopy_8.c @@ -57,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) FLOAT *b_offset; FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -87,7 +103,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v7 = VLEV_FLOAT(a_offset7, vl); v8 = VLEV_FLOAT(a_offset8, vl); - VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); a_offset1 += vl; a_offset2 += vl; @@ -116,7 +141,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v3 = VLEV_FLOAT(a_offset3, vl); v4 = VLEV_FLOAT(a_offset4, vl); - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); a_offset1 += vl; a_offset2 += vl; @@ -137,7 +167,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v1 = VLEV_FLOAT(a_offset1, vl); v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += vl; a_offset2 += vl; diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 080a873123..4742ae6a75 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VLSEV_FLOAT __riscv_vlse32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VLSEV_FLOAT __riscv_vlse64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 8; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 8; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 32; @@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 8; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 16; @@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, 
&v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e93f0056cc..32f66a7a7a 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -32,10 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -61,10 +63,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -93,6 +97,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(max_index); FLOAT_V_T vx0, vx1, v_max; + FLOAT_VX2_T vxx2; UINT_V_T v_max_index; MASK_T mask; @@ -107,7 +112,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -129,7 +137,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index b5bc27404d..d34b220fa6 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -60,8 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -90,6 +94,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(min_index); FLOAT_V_T vx0, vx1, v_min; + FLOAT_VX2_T vxx2; UINT_V_T v_min_index; MASK_T mask; @@ -104,7 +109,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -127,7 +135,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 994fadb702..3eb4238492 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -26,78 +26,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include - -#if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m8 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define ABS fabsf -#else -#define VSETVL(n) __riscv_vsetvl_e64m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m8 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 + +#if defined(DOUBLE) +#define VSETVL __riscv_vsetvl_e64m4 +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define MASK_T vbool16_t +#define VFABS __riscv_vfabs_v_f64m4 +#define VMFNE __riscv_vmfne_vf_f64m4_b16 +#define VMFGT __riscv_vmfgt_vv_f64m4_b16 +#define VMFEQ __riscv_vmfeq_vf_f64m4_b16 +#define VCPOP __riscv_vcpop_m_b16 +#define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1 +#define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1 +#define VFIRST __riscv_vfirst_m_b16 +#define VRGATHER 
__riscv_vrgather_vx_f64m4 +#define VFDIV __riscv_vfdiv_vv_f64m4 +#define VFDIV_M __riscv_vfdiv_vv_f64m4_mu +#define VFMUL __riscv_vfmul_vv_f64m4 +#define VFMUL_M __riscv_vfmul_vv_f64m4_mu +#define VFMACC __riscv_vfmacc_vv_f64m4 +#define VFMACC_M __riscv_vfmacc_vv_f64m4_mu +#define VMSBF __riscv_vmsbf_m_b16 +#define VMSOF __riscv_vmsof_m_b16 +#define VMAND __riscv_vmand_mm_b16 +#define VMANDN __riscv_vmand_mm_b16 +#define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1 +#define VMERGE __riscv_vmerge_vvm_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v) #define ABS fabs +#else +#define VSETVL __riscv_vsetvl_e32m4 +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define MASK_T vbool8_t +#define VFABS __riscv_vfabs_v_f32m4 +#define VMFNE __riscv_vmfne_vf_f32m4_b8 +#define VMFGT __riscv_vmfgt_vv_f32m4_b8 +#define VMFEQ __riscv_vmfeq_vf_f32m4_b8 +#define VCPOP __riscv_vcpop_m_b8 +#define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1 +#define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1 +#define VFIRST __riscv_vfirst_m_b8 +#define VRGATHER __riscv_vrgather_vx_f32m4 +#define VFDIV __riscv_vfdiv_vv_f32m4 +#define VFDIV_M __riscv_vfdiv_vv_f32m4_mu +#define VFMUL __riscv_vfmul_vv_f32m4 +#define VFMUL_M __riscv_vfmul_vv_f32m4_mu +#define VFMACC __riscv_vfmacc_vv_f32m4 +#define VFMACC_M __riscv_vfmacc_vv_f32m4_mu +#define VMSBF __riscv_vmsbf_m_b8 +#define VMSOF __riscv_vmsof_m_b8 +#define VMAND __riscv_vmand_mm_b8 +#define VMANDN __riscv_vmand_mm_b8 +#define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1 +#define VMERGE __riscv_vmerge_vvm_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v) +#define ABS fabsf #endif - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - - if( n <= 0 ) return(0.0); - if(n == 1) return (ABS(x[0])); - - FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_res; - FLOAT ssq = 0.0; - - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - - vr = VFMVVF_FLOAT(0, vlmax); - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); - - v0 = VLEV_FLOAT(x, vl); - - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + BLASLONG i=0; + + if (n <= 0 || inc_x <= 0) return(0.0); + if(n == 1) return (ABS(x[0])); + + unsigned int gvl = 0; + + MASK_T nonzero_mask; + MASK_T scale_mask; + + gvl = VSETVL(n); + FLOAT_V_T v0; + FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); + FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); + + FLOAT scale = 0; + FLOAT ssq = 0; + unsigned int stride_x = inc_x * sizeof(FLOAT); + int idx = 0; + + if( n >= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i 0; n -= vl, x += vl * inc_x) { - vl = VSETVL(n); - - v0 = VLSEV_FLOAT(x, stride_x, vl); - - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + //finish any tail using scalar ops + i*=gvl*inc_x; + n*=inc_x; + while(i < n){ + if ( x[i] != 0.0 ){ + FLOAT absxi = ABS( x[i] ); + if ( scale < absxi ){ + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else{ + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + + i += inc_x; } - } - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); + return(scale * sqrt(ssq)); +} - ssq = VFMVFS_FLOAT_M1(v_res); - return sqrt(ssq); -} diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index 
2e2cfd31e4..2c273fb634 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() #define FLOAT_V_T vfloat32m8_t #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 @@ -38,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() #define FLOAT_V_T vfloat64m8_t #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 @@ -54,26 +56,41 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T v0; if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); - - v0 = VLEV_FLOAT(x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSEV_FLOAT(x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + VSEV_FLOAT(x, v0, vl); + } } - - } else { + else { + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + } + } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); - for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { - vl = VSETVL(n); - - v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSSEV_FLOAT(x, stride_x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } } - } return 0; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 3cfd3ee4c0..bcd2f69817 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; - + BLASLONG m1 = m - offset; if(inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 886af0c3b7..869561fb37 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -140,6 +144,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; a += (m - 1) * m * 2; @@ -153,7 +158,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -165,12 +172,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = 0; k < i; k ++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -182,7 +193,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, 
vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index ddeef966c0..da443cfba8 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -137,6 +141,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; @@ -149,7 +154,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -161,12 +168,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = i + 1; k < m; k++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = 
VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -178,7 +189,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 4c83bbaa3b..32e481036d 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -31,13 +31,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +46,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +135,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -147,7 +150,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -159,13 +164,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = i + 1; k < n; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = 
VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -177,7 +186,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index b368eefb99..81cc418186 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -42,10 +45,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +139,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -149,7 +156,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B pcj = c; for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -161,13 +170,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = 0; k < i; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 
1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -179,7 +192,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index bbb1e876b8..180cf059a7 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmax; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -95,7 +103,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index c5453121b6..56a467502b 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmin; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -94,7 +102,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index e0da553110..66e38c1e47 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -35,6 +35,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 @@ -42,13 +45,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 @@ -56,10 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) @@ -74,6 +80,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG stride_x = inc_x2 * sizeof(FLOAT); BLASLONG stride_y = inc_y2 * sizeof(FLOAT); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if ( beta_r == 0.0 && beta_i == 0.0) { @@ -81,10 +88,12 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL { size_t vl = VSETVL(n); FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) + vxx2 = VSET_VX2(vxx2, 0, temp); + vxx2 = VSET_VX2(vxx2, 1, temp); + for ( ; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VSSSEG_FLOAT(y, stride_y, temp, temp, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); } } else @@ -92,7 +101,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); @@ -100,20 +112,26 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } } else { FLOAT_V_T v0, v1; + FLOAT_VX2_T v_x2; if ( alpha_r == 0.0 && alpha_i == 0.0 ) { for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vy1, beta_i, vl); v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); @@ -121,7 +139,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG 
inc_x, FL v1 = VFMULVF_FLOAT(vy1, beta_r, vl); v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } else @@ -129,8 +149,14 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); @@ -142,7 +168,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } } diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c index 3f75898e04..0db32df101 100644 --- a/kernel/riscv64/zaxpy_rvv.c +++ b/kernel/riscv64/zaxpy_rvv.c @@ -30,19 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -53,14 +59,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if(da_r == 0.0 && da_i == 0.0) return(0); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if(inc_x == 1 && inc_y == 1) { for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); @@ -72,7 +85,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = 
VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1) { @@ -82,8 +97,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -96,7 +116,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1) { @@ -106,8 +128,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -120,7 +147,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -131,8 +160,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -145,7 +179,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c index bd94810ce6..13879f03b8 100644 --- a/kernel/riscv64/zcopy_rvv.c +++ b/kernel/riscv64/zcopy_rvv.c @@ -34,11 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 #define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T_M4 vfloat32m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T_M4 vfloat32m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T_M8 vfloat64m8_t @@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 #define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T_M4 vfloat64m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T_M4 vfloat64m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); if(inc_x == 1 && inc_y == 1) { @@ -70,34 +70,34 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) }else if (1 == inc_x) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSEG_FLOAT_M4(&vr, &vi, x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSEG_FLOAT_M4(x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } else if (1 == inc_y) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSEG_FLOAT_M4(y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSEG_FLOAT_M4(y, vx2, vl); } } else { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index fa0e89353e..13bc2ee396 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
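The zaxpy, zcopy and zdot hunks above and below all apply the same conversion: the legacy segment load/store intrinsics that wrote the real and imaginary fields through pointer arguments are replaced by the tuple-type forms, which return or take a vfloat32m4x2_t and are paired with __riscv_vget/__riscv_vset to access the two lanes. A minimal standalone sketch of that pattern (not part of the patch; the helper name zconj_inplace is made up for illustration), assuming an RVV 1.0 toolchain that provides the tuple-type segment intrinsics used in this series:

#include <riscv_vector.h>
#include <stddef.h>

/* Conjugate an interleaved complex float array in place, n = complex elements. */
void zconj_inplace(float *x, size_t n)
{
    for (size_t vl; n > 0; n -= vl, x += vl * 2) {
        vl = __riscv_vsetvl_e32m4(n);

        /* de-interleave: field 0 = real parts, field 1 = imaginary parts */
        vfloat32m4x2_t vx2 = __riscv_vlseg2e32_v_f32m4x2(x, vl);
        vfloat32m4_t   vim = __riscv_vget_v_f32m4x2_f32m4(vx2, 1);

        vim = __riscv_vfmul_vf_f32m4(vim, -1.0f, vl);   /* negate the imaginary lane */

        vx2 = __riscv_vset_v_f32m4_f32m4x2(vx2, 1, vim); /* real lane kept as loaded */
        __riscv_vsseg2e32_v_f32m4x2(x, vx2, vl);
    }
}

The strided variants in the hunks work the same way, with __riscv_vlsseg2e32_v_f32m4x2/__riscv_vssseg2e32_v_f32m4x2 and a byte stride of inc * 2 * sizeof(FLOAT).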
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -71,6 +75,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vyx2; size_t vlmax_m1 = VSETVL_MAX_M1; v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); @@ -83,8 +88,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -104,8 +114,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -124,8 +139,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -145,8 +165,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); diff 
--git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c index b94b5f4bf5..ee334801be 100644 --- a/kernel/riscv64/zgemm_beta_rvv.c +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -41,8 +41,11 @@ #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 @@ -50,8 +53,11 @@ #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 @@ -68,6 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT *c_offset; size_t vl; FLOAT_V_T vr, vi, v1, v2, v3, v4; + FLOAT_VX2_T vx2; ldc *= 2; c_offset = c; @@ -77,6 +84,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vl = VSETVL(m); vr = VFMVVF_FLOAT(0.0, vl); vi = VFMVVF_FLOAT(0.0, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); for( ; n > 0; n--, c += ldc) { c_offset = c; @@ -84,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VSSEG_FLOAT(c_offset, vr, vi, vl); + VSSEG_FLOAT(c_offset, vx2, vl); } } @@ -96,7 +105,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VLSEG_FLOAT(&vr, &vi, c_offset, vl); + vx2 = VLSEG_FLOAT(c_offset, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); v1 = VFMULVF_FLOAT(vr, beta_r, vl); v2 = VFMULVF_FLOAT(vi, beta_i, vl); @@ -107,7 +118,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vr = VFSUBVV_FLOAT(v1, v2, vl); vi = VFADDVV_FLOAT(v3, v4, vl); - VSSEG_FLOAT(c_offset, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(c_offset, vx2, vl); } } diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c index d50a4b8d55..dce98752ef 100644 --- a/kernel/riscv64/zgemm_ncopy_4_rvv.c +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -29,18 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VGET_VX2 __riscv_vget_v_f32m1x2_f32m1 +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VGET_VX2 __riscv_vget_v_f64m1x2_f64m1 +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/zgemm_ncopy_4.c @@ -53,7 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ FLOAT *boffset; - FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42; + FLOAT_VX2_T v1x2, v2x2, v3x2, v4x2; + FLOAT_VX4_T vxx4; + FLOAT_VX8_T vxx8; size_t vl; aoffset = a; @@ -69,12 +83,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); - VLSEG2_FLOAT(&v31, &v32, aoffset3, vl); - VLSEG2_FLOAT(&v41, &v42, aoffset4, vl); - - VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + v3x2 = VLSEG2_FLOAT(aoffset3, vl); + v4x2 = VLSEG2_FLOAT(aoffset4, vl); + + vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0)); + vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1)); + vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0)); + vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1)); + vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0)); + vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1)); + vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0)); + vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1)); + + VSSEG8_FLOAT(boffset, vxx8, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -91,10 +114,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + + vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0)); + vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1)); + vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0)); + vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1)); - VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl); + VSSEG4_FLOAT(boffset, vxx4, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -108,9 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for 
(i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); - VSSEG2_FLOAT(boffset, v11, v12, vl); + VSSEG2_FLOAT(boffset, v1x2, vl); aoffset1 += vl * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c index 1d3b8d3b71..275daa5f20 100644 --- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -30,14 +30,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ @@ -48,7 +48,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ FLOAT *a_offset1; FLOAT *b_offset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a_offset += vl * lda * 2; for(i = m; i > 0; i--) { - VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(b_offset, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += 2; b_offset += vl * 2; diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index 8c35b5616e..cfafbf0dc7 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -30,25 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ @@ -60,7 +66,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ IFLOAT *boffset, *boffset1, *boffset2, *boffset3; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -81,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -91,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -101,8 +111,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -119,8 +129,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, 
v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -129,8 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -139,8 +149,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); //aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c index 7a085269c8..96e9865028 100644 --- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -29,14 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -47,7 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *aoffset1; IFLOAT *boffset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) aoffset += vl * 2; for(i = m; i > 0; i--) { - VLSEG2_FLOAT(&v0, &v1, aoffset1, vl); - VSSEG2_FLOAT(boffset, v0, v1, vl); + vx2 = VLSEG2_FLOAT(aoffset1, vl); + VSSEG2_FLOAT(boffset, vx2, vl); aoffset1 += lda * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c index 41399cf79b..77e012ff56 100644 --- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -30,20 +30,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
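The zgemm ncopy/tcopy hunks above extend the same idea to wider tuples: column pairs loaded with vlseg2 are reassembled into x4/x8 tuples via __riscv_vset before a single segmented store, instead of passing eight separate vector registers. A minimal standalone sketch of that packing step (not part of the patch; the helper name and two-column layout are illustrative only), assuming LMUL=1 as in those copy kernels:

#include <riscv_vector.h>
#include <stddef.h>

/* Pack two interleaved complex float columns of length m into one panel,
 * four segments (re0, im0, re1, im1) per row, as the ncopy kernels do. */
void zpack_2cols(const float *col0, const float *col1, float *dst, size_t m)
{
    for (size_t vl; m > 0; m -= vl, col0 += vl * 2, col1 += vl * 2, dst += vl * 4) {
        vl = __riscv_vsetvl_e32m1(m);

        vfloat32m1x2_t c0 = __riscv_vlseg2e32_v_f32m1x2(col0, vl);
        vfloat32m1x2_t c1 = __riscv_vlseg2e32_v_f32m1x2(col1, vl);

        /* all four fields are written before the store (same idiom as the
         * copy kernels above, which build their x4/x8 tuples with vset) */
        vfloat32m1x4_t out;
        out = __riscv_vset_v_f32m1_f32m1x4(out, 0, __riscv_vget_v_f32m1x2_f32m1(c0, 0)); /* re0 */
        out = __riscv_vset_v_f32m1_f32m1x4(out, 1, __riscv_vget_v_f32m1x2_f32m1(c0, 1)); /* im0 */
        out = __riscv_vset_v_f32m1_f32m1x4(out, 2, __riscv_vget_v_f32m1x2_f32m1(c1, 0)); /* re1 */
        out = __riscv_vset_v_f32m1_f32m1x4(out, 3, __riscv_vget_v_f32m1x2_f32m1(c1, 1)); /* im1 */

        __riscv_vsseg4e32_v_f32m1x4(dst, out, vl);
    }
}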
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -80,6 +86,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b BLASLONG i,j,k; FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -109,10 +116,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -137,7 +148,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -162,7 +175,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -211,7 +226,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -237,35 +254,57 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, 
va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); - VLSEG2_FLOAT(&va0, &va1, C2, vl); - VLSEG2_FLOAT(&va2, &va3, C3, vl); + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + vax2 = VLSEG2_FLOAT(C2, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C3, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -294,9 +333,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -311,7 +355,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -326,7 +372,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -356,7 +404,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -372,20 +422,31 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - 
VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -409,9 +470,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -420,7 +486,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -430,7 +498,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -448,7 +518,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -458,12 +530,18 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); C0 += vl * 2; } diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c index 4a40c30a79..f14ef5ba8c 100644 --- a/kernel/riscv64/zgemv_n_rvv.c +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -62,6 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a_ptr; FLOAT temp_r, temp_i; FLOAT_V_T va0, va1, vy0, vy1; + FLOAT_VX2_T vax2, vyx2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -73,7 +80,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -84,7 +94,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -113,7 +126,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSEG_FLOAT(y, vy0, vy1, vl); + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } @@ -123,7 +139,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -134,7 +152,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) 
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -163,7 +183,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } return(0); diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index 2f03805305..1c89a9f728 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -32,9 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f32m4_f32m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -46,9 +48,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f64m4_f64m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -66,6 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT_V_T va0, va1, vx0, vx1, vr, vi; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vax2; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; //BLASLONG stride_a = sizeof(FLOAT) * 2; @@ -73,6 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL(m); @@ -86,8 +92,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSEG_FLOAT(&x[ix], vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -104,9 +115,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) @@ -130,8 +141,13 @@ int CNAME(BLASLONG m, BLASLONG n, 
BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -148,9 +164,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index 79b20a6467..97013895ae 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -81,6 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -96,8 +103,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posY * 2 + posX * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -111,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index a86815275e..59029e9e59 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -79,6 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -94,8 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -108,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index d2b27aa8d3..32f67758a1 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -28,95 +28,248 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
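The rewritten znrm2 loop in the hunk below keeps a running (scale, ssq) pair so that intermediate squares neither overflow nor underflow: whenever an element larger than the current scale is found, the accumulated sum is rescaled before accumulation continues. A scalar sketch of that update rule (illustrative only, not part of the patch; the vector kernel applies the same idea one vector chunk at a time, using vfredmax to find the new scale and vfdiv before the fused multiply-add):

#include <math.h>

/* Maintain the invariant  sum |x_k|^2 == scale*scale * ssq.
 * The final norm is scale * sqrt(ssq). */
static void ssq_update(double v, double *scale, double *ssq)
{
    v = fabs(v);
    if (v == 0.0) return;
    if (v <= *scale) {
        double t = v / *scale;
        *ssq += t * t;              /* stay on the current scale */
    } else {
        double t = *scale / v;
        *ssq = 1.0 + *ssq * t * t;  /* rescale the old sum, count v as 1 */
        *scale = v;
    }
}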
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif -// TODO: Should single precision use the widening MAC, or perhaps all should be double? 
- FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i=0, j=0; - if ( n <= 0 ) return(0.0); - - FLOAT_V_T vr, v0, v1; - FLOAT_V_T_M1 v_max, v_res; - FLOAT scale = 0.0, ssq = 0.0; - - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); + if (n <= 0 || inc_x <= 0) return(0.0); - vr = VFMVVF_FLOAT(0, vlmax); + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; if (inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - - VLSEG_FLOAT(&v0, &v1, x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i=0,j=0; i 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater element + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + //ssq in vector vr + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + j += gvl; + idx += inc_v; + } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + //tail + if (j < n) { + gvl = VSETVL(n-j); + v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if(index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + } else { // found greater element + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) {//no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater 
element + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); } - } - - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); - - ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - - return(scale * sqrt(ssq)); + return(scale * sqrt(ssq)); } diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c index ee81bfe915..1d53906849 100644 --- a/kernel/riscv64/zrot_rvv.c +++ b/kernel/riscv64/zrot_rvv.c @@ -30,28 +30,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -63,6 +69,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if (n <= 0) return(0); FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2, vtx2; if (inc_x == 0 && inc_y == 0) { BLASLONG i=0; @@ -93,8 +100,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = 
VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -105,8 +117,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1){ @@ -115,8 +132,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -127,8 +149,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1){ @@ -137,8 +164,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -149,8 +181,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -160,8 +197,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -172,8 +214,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zscal_rvv.c 
b/kernel/riscv64/zscal_rvv.c index 779fab68c3..2586c60366 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 @@ -43,10 +46,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -61,6 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F FLOAT_V_T vt, vr, vi; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); size_t vlmax = VSETVL_MAX; + FLOAT_VX2_T vx2; if(da_r == 0.0 && da_i == 0.0) { @@ -71,16 +78,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - - VSSEG_FLOAT(x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - - VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } @@ -89,12 +98,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vi, -da_i, vl); vi = VFMULVF_FLOAT(vr, da_i, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else if(da_i == 0.0) { @@ -102,12 +116,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vr = VFMULVF_FLOAT(vr, da_r, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); - VSSSEG_FLOAT(x, 
stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else { @@ -117,14 +135,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vr, &vi, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSEG_FLOAT(x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { @@ -132,14 +154,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } } diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index b41f70eb53..489188bd56 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -32,8 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -44,8 +46,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -59,6 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(sumf); FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vlmax = VSETVL_MAX; FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); @@ -67,7 +72,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); @@ -80,7 +88,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c index 17b7b9f437..c2adf5e05d 100644 --- a/kernel/riscv64/zswap_rvv.c +++ b/kernel/riscv64/zswap_rvv.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n <= 0) return(0); - FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if (inc_x == 0 && inc_y == 0) { if (n & 1) { @@ -75,8 +75,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl); - VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl); + vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vyx2, vl); } y[0] = temp[0]; y[1] = 
temp[1]; @@ -92,8 +92,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl); - VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl); + vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vxx2, vl); } x[0] = temp[0]; x[1] = temp[1]; @@ -103,11 +103,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_x == 1){ @@ -116,11 +116,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_y == 1){ @@ -129,11 +129,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } else { @@ -143,11 +143,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index 0f9e04869d..f4d8061909 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -70,6 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -85,15 +92,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index fdc693700a..069551bb0e 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -71,6 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -86,15 +93,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + 0 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index 7276618c5b..ae664561b4 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vint32m2_t #define VID_V_UINT __riscv_vid_v_i32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -69,6 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -98,8 +103,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X > posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -119,7 +124,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -128,7 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c index 72e8f2ce2a..ab8d343373 100644 --- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -32,11 +32,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -46,11 +48,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -65,6 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; #ifdef UNIT @@ -101,8 +106,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON else if (X < posY) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; @@ -115,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (unsigned int j = 0; j < vl; j++) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +132,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c index e6d36c86d6..ba6e63b965 100644 --- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -67,6 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -96,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X < posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -118,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -127,7 +135,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c index 7085cfc379..a624fff543 100644 --- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -34,11 +34,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -48,11 +50,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -66,6 +70,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, j, js, X; FLOAT *ao; + + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; #ifdef UNIT VBOOL_T vbool_eq; @@ -103,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else if (X > posY) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; X++; @@ -115,7 +121,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +133,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c index 92b4b855bf..db5f06af85 100644 --- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -30,10 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 @@ -41,10 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -85,6 +91,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b off = 0; #endif + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -130,10 +137,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -158,7 +169,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -183,7 +196,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -233,7 +248,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -262,25 +279,37 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); va0 = VFMULVF_FLOAT(vres4, alphar, vl); va1 = VFMULVF_FLOAT(vres5, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMULVF_FLOAT(vres6, alphar, vl); va3 = VFMULVF_FLOAT(vres7, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); #if ( 
defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -342,10 +371,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -360,7 +393,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -375,7 +410,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -405,7 +442,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -425,13 +464,19 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -487,10 +532,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -500,7 +549,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -510,7 +561,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -530,7 +583,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 
0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -545,7 +600,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c index 383cb883fb..36cec711d8 100644 --- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG stride_lda = sizeof(FLOAT)*lda*2; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; size_t vl; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); ao += 2; @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c index f57e9f1dec..3a7bdb522a 100644 --- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); b += vl * 2; ao += lda * 2; @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c index be36134294..2a158d4dea 100644 --- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -31,20 +31,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -62,7 +62,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -83,9 +83,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT for (unsigned int j = 0; j < vl; j++) { compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); ao += 2; b += vl * 2; } @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c index b1f5ef8f09..4b3319588a 100644 --- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -81,9 +81,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/param.h b/param.h index c5c70b78e3..d93221d285 100644 --- a/param.h +++ b/param.h @@ -3057,7 +3057,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define CGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_MN 16
+#define CGEMM_DEFAULT_UNROLL_MN 32
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 4

From ff41cf5c49bda6bf84950812e737b2e7bcddf139 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva
Date: Fri, 17 Mar 2023 14:28:26 +0300
Subject: [PATCH 24/36] Fix BLAS, BLAS-like functions and Generic RISC-V kernels

* Fixed gemmt, imatcopy, zimatcopy_cnc functions
* Fixed cblas_cscal testing in ctest
* Removed unreachable code in rotmg
* Added zero-size checks
---
 cblas.h | 8 ++
 common_interface.h | 9 ++
 ctest/c_cblat1.f | 10 +-
 ctest/c_cblat1c.c | 6 +-
 interface/gemmt.c | 233 ++++++++++++++++++++++-----------
 interface/imatcopy.c | 8 +-
 interface/rotmg.c | 28 +---
 interface/zimatcopy.c | 6 +-
 kernel/generic/zimatcopy_cnc.c | 1 -
 kernel/riscv64/axpby.c | 2 +-
 kernel/riscv64/axpy.c | 2 +-
 kernel/riscv64/copy.c | 2 +-
 kernel/riscv64/dot.c | 2 +-
 kernel/riscv64/swap.c | 2 +-
 kernel/riscv64/zaxpy.c | 2 +-
 kernel/riscv64/zcopy.c | 2 +-
 kernel/riscv64/zswap.c | 2 +-
 17 files changed, 201 insertions(+), 124 deletions(-)

diff --git a/cblas.h b/cblas.h
index c2bdd27fa4..f7d36788dd 100644
--- a/cblas.h
+++ b/cblas.h
@@ -289,6 +289,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
+void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+ OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+ OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+ OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
+void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+ OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order,
OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); diff --git a/common_interface.h b/common_interface.h index 3188279208..61a82c306d 100644 --- a/common_interface.h +++ b/common_interface.h @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); + int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 1a123d74dc..cad7c7fa73 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -96,7 +96,7 @@ SUBROUTINE CHECK1(SFAC) INTEGER ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST * .. External Subroutines .. - EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 + EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. @@ -214,8 +214,8 @@ SUBROUTINE CHECK1(SFAC) CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN -* .. CSCAL .. - CALL CSCAL(N,CA,CX,INCX) +* .. CSCALTEST .. + CALL CSCALTEST(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN @@ -236,14 +236,14 @@ SUBROUTINE CHECK1(SFAC) * INCX = 1 IF (ICASE.EQ.8) THEN -* CSCAL +* CSCALTEST * Add a test for alpha equal to zero. CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE - CALL CSCAL(5,CA,CX,INCX) + CALL CSCALTEST(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCALTEST diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index 8c0dd140cb..af29301afd 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -685,7 +685,7 @@ real *sfac; static integer i__; extern /* Subroutine */ int ctest_(); static complex mwpcs[5], mwpct[5]; - extern /* Subroutine */ int itest1_(), stest1_(); + extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); static complex cx[8]; extern real scnrm2test_(); static integer np1; @@ -727,7 +727,7 @@ real *sfac; stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); } else if (combla_1.icase == 8) { /* .. CSCAL .. 
*/ - cscal_(&combla_1.n, &ca, cx, &combla_1.incx); + cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); } else if (combla_1.icase == 9) { @@ -761,7 +761,7 @@ real *sfac; mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; /* L80: */ } - cscal_(&c__5, &ca, cx, &combla_1.incx); + cscaltest_(&c__5, &ca, cx, &combla_1.incx); ctest_(&c__5, cx, mwpct, mwpcs, sfac); } else if (combla_1.icase == 9) { /* CSSCALTEST */ diff --git a/interface/gemmt.c b/interface/gemmt.c index 3eed1dfe49..a4530721ce 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -35,29 +35,26 @@ #include #include #include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE -#define ERROR_NAME "QGEMT " +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMT " +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMT " +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMT " +#define ERROR_NAME "SGEMMT " #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifdef XDOUBLE -#define ERROR_NAME "XGEMT " +#define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "ZGEMT " +#define ERROR_NAME "ZGEMMT " #else -#define ERROR_NAME "CGEMT " +#define ERROR_NAME "CGEMMT " #endif #endif @@ -68,18 +65,22 @@ #ifndef CBLAS void NAME(char *UPLO, char *TRANSA, char *TRANSB, - blasint * M, blasint * N, blasint * K, + blasint * M, blasint * K, FLOAT * Alpha, IFLOAT * a, blasint * ldA, IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) { - blasint m, n, k; + blasint m, k; blasint lda, ldb, ldc; int transa, transb, uplo; blasint info; char transA, transB, Uplo; + blasint nrowa, nrowb; +#if defined(COMPLEX) + blasint ncolb; +#endif IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -92,7 +93,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, PRINT_DEBUG_NAME; m = *M; - n = *N; k = *K; #if defined(COMPLEX) @@ -159,32 +159,47 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, if (Uplo == 'L') uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } + info = 0; - if (uplo < 0) - info = 14; - if (ldc < m) + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - if (info) { + if (info != 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, - blasint N, blasint k, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, #ifndef COMPLEX FLOAT alpha, IFLOAT * A, blasint LDA, @@ -205,17 +220,23 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, n, lda, ldb; + blasint lda, ldb; FLOAT *a, *b; +#if defined(COMPLEX) + blasint nrowb, ncolb; +#endif XFLOAT *buffer; PRINT_DEBUG_CNAME; + uplo = -1; transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) transa = 0; @@ -248,9 +269,6 @@ void 
CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, transb = 3; #endif - m = M; - n = N; - a = (void *)A; b = (void *)B; lda = LDA; @@ -258,23 +276,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint nrowa; +#if !defined(COMPLEX) + blasint nrowb; +#endif + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } + + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; } if (order == CblasRowMajor) { - m = N; - n = M; a = (void *)B; b = (void *)A; @@ -282,6 +319,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) @@ -315,29 +355,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint ncola; +#if !defined(COMPLEX) + blasint ncolb; +#endif + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; +#if defined(COMPLEX) + nrowb = m; +#endif + + if (transb & 1) { +#if defined(COMPLEX) + nrowb = k; +#endif + ncolb = m; + } + + if (ldc < MAX(1,m)) info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) info = 2; if (transa < 0) + info = 3; + if (uplo < 0) info = 1; - } - uplo = -1; - if (Uplo == CblasUpper) - uplo = 0; - if (Uplo == CblasLower) - uplo = 1; - if (uplo < 0) - info = 14; - if (info >= 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; @@ -407,37 +460,48 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) || (n == 0)) + if (m == 0) return; IDEBUG_START; - FUNCTION_PROFILE_START(); +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif - const blasint incb = (transb == 0) ? 1 : ldb; + const blasint incb = ((transb & 1) == 0) ? 
1 : ldb; if (uplo == 1) { - for (i = 0; i < n; i++) { - j = n - i; + for (i = 0; i < m; i++) { + j = m - i; l = j; #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i * 2; - bb = b + i * 2; } + if (transb & 1) + bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i; - bb = b + i; } + if (transb & 1) + bb = b + i; cc = c + i * ldc + i; #endif @@ -447,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -458,8 +522,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -479,20 +541,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); } #endif @@ -501,21 +577,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } } else { - for (i = 0; i < n; i++) { + for (i = 0; i < m; i++) { j = i + 1; l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transb & 1) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transa) { - l = k; + if (transb & 1) { bb = b + i; } cc = c + i * ldc; @@ -527,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -537,8 +611,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -558,32 +630,41 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); - + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); } #endif STACK_FREE(buffer); } } - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, - args.m * args.k + args.k * args.n + - args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; return; -} +} \ No newline at end of file diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 91975f7f41..109280fe69 100644 --- 
a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -149,10 +149,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/rotmg.c b/interface/rotmg.c index 3a5ca8f95a..b8f627221d 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ else { dp2 = *dd2 * dy1; - if(dp2 == ZERO) - { - dflag = -TWO; - dparam[0] = dflag; - return; - } dp1 = *dd1 * *dx1; dq2 = dp2 * dy1; dq1 = dp1 * *dx1; @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dh12 = dp2 / dp1; du = ONE - dh12 * dh21; - if(du > ZERO) - { - dflag = ZERO; - *dd1 = *dd1 / du; - *dd2 = *dd2 / du; - *dx1 = *dx1 * du; - } else { - dflag = -ONE; - - dh11 = ZERO; - dh12 = ZERO; - dh21 = ZERO; - dh22 = ZERO; - - *dd1 = ZERO; - *dd2 = ZERO; - *dx1 = ZERO; - } + dflag = ZERO; + *dd1 = *dd1 / du; + *dd2 = *dd2 / du; + *dx1 = *dx1 * du; } else diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index ecda5ef4ec..7d73ba5720 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -171,10 +171,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, } #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/kernel/generic/zimatcopy_cnc.c b/kernel/generic/zimatcopy_cnc.c index 8e772bd8ab..6426cffc09 100644 --- a/kernel/generic/zimatcopy_cnc.c +++ b/kernel/generic/zimatcopy_cnc.c @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); - if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); aptr = a; lda *= 2; diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c index 278747f755..04f9518d31 100644 --- a/kernel/riscv64/axpby.c +++ b/kernel/riscv64/axpby.c @@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); ix = 0; iy = 0; diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c index fb1094dd9a..19d12ad3fe 100644 --- a/kernel/riscv64/axpy.c +++ b/kernel/riscv64/axpy.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c index 7b4f04f301..e79ca59aff 100644 --- a/kernel/riscv64/copy.c +++ b/kernel/riscv64/copy.c @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG i=0; BLASLONG ix=0,iy=0; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); while(i < n) { diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index 46a84ad189..bf55998ca9 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, 
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); while(i < n) { diff --git a/kernel/riscv64/swap.c b/kernel/riscv64/swap.c index eac621fb2b..33bbeeb6ac 100644 --- a/kernel/riscv64/swap.c +++ b/kernel/riscv64/swap.c @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG ix=0,iy=0; FLOAT temp; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); while(i < n) { diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c index 1dcaeac272..18b6315cbc 100644 --- a/kernel/riscv64/zaxpy.c +++ b/kernel/riscv64/zaxpy.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c index 07fe584c57..b0f19efd5d 100644 --- a/kernel/riscv64/zcopy.c +++ b/kernel/riscv64/zcopy.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; diff --git a/kernel/riscv64/zswap.c b/kernel/riscv64/zswap.c index ae4760ae0d..df1402b94f 100644 --- a/kernel/riscv64/zswap.c +++ b/kernel/riscv64/zswap.c @@ -45,7 +45,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; From 5b4df851d7581145f0aee4336f11127a3a7acc8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Mar 2023 08:29:05 +0100 Subject: [PATCH 25/36] fix stray blank on continuation line --- interface/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 6f320d8f7b..a4d3f710aa 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -92,8 +92,9 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) \ - cspr2.$(SUFFIX) \ + csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + cspr.$(SUFFIX) cspr2.$(SUFFIX) \ + csymv.$(SUFFIX) csyr.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ From 1c04df20bd9c845160b3eb2e51adaceb6f93cf8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 20 Mar 2023 23:04:12 +0100 Subject: [PATCH 26/36] Re-enable overriding the LAPACK SYMV,SYR,SPMV and SPR implementations --- lapack-netlib/SRC/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 49798b0c5d..5f22789bd6 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -572,22 +572,26 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o strtrs.o + slauum.o strti2.o strtri.o strtrs.o \ + ssymv.o ssyr.o sspmv.o sspr.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o dtrtrs.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o \ + dsymv.o dsyr.o dspmv.o dspr.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o ctrtrs.o + 
clauum.o ctrti2.o ctrtri.o ctrtrs.o \ + csymv.o csyr.o cspmv.o cspr.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o ztrtrs.o + zlauum.o ztrti2.o ztrtri.o ztrtrs.o \ + zsymv.o zsyr.o zspmv.o zspr.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) From 5222b5fc18829265be7ffc77e77271a18f17c005 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 12 Oct 2023 22:06:00 +0300 Subject: [PATCH 27/36] Added axpby kernels for GENERIC RISC-V target --- kernel/riscv64/KERNEL.RISCV64_GENERIC | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 61a8a2b918..15bcd2289e 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c ZAXPYKERNEL = ../riscv64/zaxpy.c +SAXPBYKERNEL = ../riscv64/axpby.c +DAXPBYKERNEL = ../riscv64/axpby.c +CAXPBYKERNEL = ../riscv64/zaxpby.c +ZAXPBYKERNEL = ../riscv64/zaxpby.c + SCOPYKERNEL = ../riscv64/copy.c DCOPYKERNEL = ../riscv64/copy.c CCOPYKERNEL = ../riscv64/zcopy.c From f1291614536d7d1bec6508fda9b0c56dd7286bb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Mar 2023 07:43:03 +0100 Subject: [PATCH 28/36] restore C/Z SPMV, SPR, SYR,SYMV --- interface/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index a4d3f710aa..3db4b2b6d4 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -122,8 +122,9 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) \ - zspr2.$(SUFFIX) \ + zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zspr.$(SUFFIX) zspr2.$(SUFFIX) \ + zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ From 85548e66ca25228a73ec08c257d5d92108b94b62 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Mar 2023 16:33:09 +0200 Subject: [PATCH 29/36] Fix build failures seen with the NO_LAPACK option - cspr/csymv/csyr belong on the LAPACK list --- interface/Makefile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 3db4b2b6d4..6f320d8f7b 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -92,9 +92,8 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ - cspr.$(SUFFIX) cspr2.$(SUFFIX) \ - csymv.$(SUFFIX) csyr.$(SUFFIX) \ + csbmv.$(SUFFIX) \ + cspr2.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ @@ -122,9 +121,8 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ - zspr.$(SUFFIX) zspr2.$(SUFFIX) \ - zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ + zsbmv.$(SUFFIX) \ + zspr2.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ From f7cf637d7aad0990625f41f83db74446a5908509 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Mar 2023 18:31:04 +0200 Subject: [PATCH 30/36] redo lost edit --- interface/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 
deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 6f320d8f7b..275b71a1c2 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -445,7 +445,8 @@ QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \ + cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX) #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ @@ -456,8 +457,8 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) - + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \ + zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX) XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ From f89e0034a479016ab5d9e1681abf07dab7f8cf38 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Wed, 20 Dec 2023 21:20:30 +0300 Subject: [PATCH 31/36] Fix LAPACK usage from BLAS --- interface/Makefile | 7 +++---- lapack-netlib/SRC/Makefile | 12 ++++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 275b71a1c2..6f320d8f7b 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -445,8 +445,7 @@ QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \ - cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ @@ -457,8 +456,8 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \ - zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX) + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) + XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 5f22789bd6..49798b0c5d 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -572,26 +572,22 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o strtrs.o \ - ssymv.o ssyr.o sspmv.o sspr.o + slauum.o strti2.o strtri.o strtrs.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o dtrtrs.o \ - dsymv.o dsyr.o dspmv.o dspr.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o ctrtrs.o \ - csymv.o csyr.o cspmv.o cspr.o + clauum.o ctrti2.o ctrtri.o ctrtrs.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o ztrtrs.o \ - zsymv.o zsyr.o zspmv.o zspr.o + 
zlauum.o ztrti2.o ztrtri.o ztrtrs.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) From ccbc3f875bc87e92c1ab05f3b361f64d7fd95c87 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Fri, 19 Jan 2024 12:40:00 +0000 Subject: [PATCH 32/36] [RISC-V] Add RISCV64_ZVL128B target to common_riscv64.h --- common_riscv64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_riscv64.h b/common_riscv64.h index f11e8b75d4..4b5f7dcc47 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) # include #endif From e1afb23811256b231c259ca57d7a5f6e81ac6da5 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 7 Apr 2023 11:13:23 +0300 Subject: [PATCH 33/36] Fix BLAS and LAPACK tests for C910V and RISCV64_ZVL256B targets * Fixed bugs in dgemm, [a]min\max, asum kernels * Added zero checks for BLAS kernels * Added dsdot implementation for RVV 0.7.1 * Fixed bugs in _vector files for C910V and RISCV64_ZVL256B targets * Added additional definitions for RISCV64_ZVL256B target --- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + TargetList.txt | 1 + getarch.c | 14 +++ kernel/riscv64/KERNEL.C910V | 1 + kernel/riscv64/amin_vector.c | 6 +- kernel/riscv64/asum_vector.c | 7 +- kernel/riscv64/axpby_vector.c | 2 +- kernel/riscv64/dgemm_kernel_8x4_c910v.c | 2 +- kernel/riscv64/dsdot_vector.c | 152 ++++++++++++++++++++++++ kernel/riscv64/iamin_vector.c | 4 +- kernel/riscv64/izamin_vector.c | 2 +- kernel/riscv64/nrm2_vector.c | 2 +- kernel/riscv64/nrm2_vector_dot.c | 2 +- kernel/riscv64/swap_vector.c | 2 +- kernel/riscv64/zamax_vector.c | 17 +-- kernel/riscv64/zamin_vector.c | 17 +-- kernel/riscv64/znrm2_vector.c | 2 +- kernel/riscv64/zswap_vector.c | 2 +- 19 files changed, 205 insertions(+), 38 deletions(-) create mode 100644 kernel/riscv64/dsdot_vector.c diff --git a/Makefile.prebuild b/Makefile.prebuild index c4f4a26026..d30275f062 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -59,6 +59,10 @@ ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif +ifeq ($(TARGET), RISCV64_ZVL256B) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +endif + ifeq ($(TARGET), RISCV64_GENERIC) TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d endif diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce7a271412..2239a3676f 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -6,6 +6,10 @@ ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif +ifeq ($(CORE), RISCV64_ZVL256B) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static diff --git a/TargetList.txt b/TargetList.txt index f76f605cc3..f65a18b505 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -121,6 +121,7 @@ Z14 RISCV64_GENERIC (e.g. 
PolarFire Soc/SiFive U54) C910V x280 +RISCV64_ZVL256B 11.LOONGARCH64: LOONGSONGENERIC diff --git a/getarch.c b/getarch.c index 7728363472..12ea720522 100644 --- a/getarch.c +++ b/getarch.c @@ -1692,6 +1692,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_ZVL256B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL256B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL256B " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl256b" +#define CORENAME "RISCV64_ZVL256B" +#else +#endif + #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 0da66fa359..2798a870ed 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = dsdot_vector.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 1c541f0fd1..c4578eabf9 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -31,15 +31,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # define LMUL m2 # if defined(DOUBLE) # define ELEN 64 +# define ABS fabs # else # define ELEN 32 +# define ABS fabsf # endif #else # define LMUL m8 # if defined(DOUBLE) # define ELEN 64 +# define ABS fabs # else # define ELEN 32 +# define ABS fabsf # endif #endif @@ -69,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=0.0; if (n <= 0 || inc_x <= 0) return(minf); - minf = *x; + minf = ABS(*x); x += inc_x; --n; if (n == 0) return(minf); diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index 995dbf9a13..a652eafdd4 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -67,7 +67,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; - BLASLONG ix=0; FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; @@ -103,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); - BLASLONG inc_xv = inc_x * gvl; for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + }else if(inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else if(inc_x == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else{ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + } + return(dot); +} diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index a588729602..0e591e6975 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -139,7 +139,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); @@ -185,7 +185,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); diff --git 
a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index a3877a46c2..c76a38099c 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -156,7 +156,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 141dffebfd..5c03fbec7a 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -104,7 +104,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; - if(n <= 0) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); if(n == 1) return (ABS(x[0])); unsigned int gvl = 0; diff --git a/kernel/riscv64/nrm2_vector_dot.c b/kernel/riscv64/nrm2_vector_dot.c index 06e61d6959..dfa13a6f59 100644 --- a/kernel/riscv64/nrm2_vector_dot.c +++ b/kernel/riscv64/nrm2_vector_dot.c @@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG i=0, j=0; double len = 0.0 ; - if ( n < 0 ) return(0.0); + if ( n <= 0 ) return(0.0); if(n == 1) return (ABS(x[0])); FLOAT_V_T vr, v0, v1; diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index 3b467a5868..f583f53923 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -67,7 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - if (n < 0) return(0); + if (n <= 0) return(0); unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 2dee5ab29a..ec4a5a1e95 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -60,17 +60,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) #ifdef RISCV_0p10_INTRINSICS #define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) -#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) #else #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) #endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) #define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -91,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i Date: Wed, 24 Jan 2024 10:53:13 +0300 Subject: [PATCH 34/36] Fix x280 taget include riscv_vector.h --- common_riscv64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_riscv64.h b/common_riscv64.h index 4b5f7dcc47..ab3bfa25a1 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) # include #endif From 73530b03fa6ecd03e7ceb2b37c234a0bb1626445 Mon Sep 17 00:00:00 2001 From: Andrey Sokolov Date: Wed, 24 Jan 2024 11:38:14 +0300 Subject: [PATCH 35/36] remove RISCV64_ZVL256B additional extentions --- Makefile.prebuild | 2 +- Makefile.riscv64 | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index 7824e15a87..98acca80e5 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -60,7 +60,7 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif ifeq ($(TARGET), RISCV64_ZVL256B) -TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d endif ifeq ($(TARGET), RISCV64_ZVL128B) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 9d314d0740..113cc57c53 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -7,8 +7,8 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL256B) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL128B) CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d From a3b0ef6596d51ecfb59b0a2f6a7b0d59bc4f18b4 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Thu, 1 Feb 2024 10:26:02 +0000 Subject: [PATCH 36/36] Restore riscv64 fixes from develop branch: dot product double precision accumulation, zscal NaN handling --- Makefile.prebuild | 1 + kernel/riscv64/dot.c | 10 ++++ kernel/riscv64/zscal_rvv.c | 90 ++++++----------------------------- 
kernel/riscv64/zscal_vector.c | 79 +----------------------------- 4 files changed, 26 insertions(+), 154 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index b44b50039c..b7d695a750 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -57,6 +57,7 @@ endif ifeq ($(TARGET), CK860FV) TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +endif ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index bf55998ca9..8ad493a2b4 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; + +#if defined(DSDOT) double dot = 0.0 ; +#else + FLOAT dot = 0.0 ; +#endif if ( n < 1 ) return(dot); while(i < n) { +#if defined(DSDOT) + dot += (double) y[iy] * (double) x[ix] ; +#else dot += y[iy] * x[ix] ; +#endif + ix += inc_x ; iy += inc_y ; i++ ; diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 2586c60366..ae79d9f9d9 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -69,49 +69,26 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F size_t vlmax = VSETVL_MAX; FLOAT_VX2_T vx2; - if(da_r == 0.0 && da_i == 0.0) { + if(inc_x == 1) { - vr = VFMVVF_FLOAT(0.0, vlmax); - vi = VFMVVF_FLOAT(0.0, vlmax); - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - vx2 = VSET_VX2(vx2, 0, vr); - vx2 = VSET_VX2(vx2, 1, vi); - VSSEG_FLOAT(x, vx2, vl); - } - - } else { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - vx2 = VSET_VX2(vx2, 0, vr); - vx2 = VSET_VX2(vx2, 1, vi); - VSSSEG_FLOAT(x, stride_x, vx2, vl); - } - } - - } else if(da_r == 0.0) { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - - vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx2 = VLSEG_FLOAT(x, vl); vr = VGET_VX2(vx2, 0); vi = VGET_VX2(vx2, 1); - vt = VFMULVF_FLOAT(vi, -da_i, vl); - vi = VFMULVF_FLOAT(vr, da_i, vl); + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); - - VSSSEG_FLOAT(x, stride_x, vx2, vl); + VSSEG_FLOAT(x, vx2, vl); } - } else if(da_i == 0.0) { + } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); @@ -120,54 +97,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vr = VGET_VX2(vx2, 0); vi = VGET_VX2(vx2, 1); - vr = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); VSSSEG_FLOAT(x, stride_x, vx2, vl); } - - } else { - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - - vx2 = VLSEG_FLOAT(x, vl); - vr = VGET_VX2(vx2, 0); - vi = VGET_VX2(vx2, 1); - - vt = VFMULVF_FLOAT(vr, da_r, vl); - vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); - vi = VFMULVF_FLOAT(vi, da_r, vl); - vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - - vx2 = VSET_VX2(vx2, 0, vt); - vx2 = VSET_VX2(vx2, 1, vi); - VSSEG_FLOAT(x, vx2, vl); - } - - } else { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - - vx2 = VLSSEG_FLOAT(x, stride_x, vl); - vr = 
VGET_VX2(vx2, 0); - vi = VGET_VX2(vx2, 1); - - vt = VFMULVF_FLOAT(vr, da_r, vl); - vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); - vi = VFMULVF_FLOAT(vi, da_r, vl); - vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - - vx2 = VSET_VX2(vx2, 0, vt); - vx2 = VSET_VX2(vx2, 1, vi); - VSSSEG_FLOAT(x, stride_x, vx2, vl); - } - } } return(0); diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 2034aafaae..536bbdf736 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(0.0, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } - }else if(da_r == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else if(da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else{ + { gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl;
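
The zscal rewrite above (kernel/riscv64/zscal_rvv.c and zscal_vector.c) drops the special-cased branches for da_r == 0.0 and/or da_i == 0.0 and always runs the general complex-multiply path, so NaN or Inf values already present in x propagate through the scaling instead of being silently overwritten with zeros. As a rough scalar illustration only (hypothetical zscal_ref, plain C types standing in for FLOAT/BLASLONG, not part of the patch set), the retained general path computes:

    /* Scalar sketch of the general complex-scale path kept above:
     * no alpha == 0 shortcuts, so e.g. 0.0 * NAN stays NAN per IEEE 754. */
    static void zscal_ref(long n, double da_r, double da_i,
                          double *x, long inc_x)
    {
        long i;
        if (n <= 0) return;                      /* zero-size guard */

        for (i = 0; i < n; i++) {
            double re = x[2 * i * inc_x];
            double im = x[2 * i * inc_x + 1];
            x[2 * i * inc_x]     = da_r * re - da_i * im;   /* real part  */
            x[2 * i * inc_x + 1] = da_r * im + da_i * re;   /* imag part  */
        }
    }

Under this sketch, zscal_ref(n, 0.0, 0.0, x, 1) on an x holding NAN leaves NAN in place (0.0 * NAN - 0.0 * NAN is NAN), which is the behaviour the removed shortcut branches broke.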
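
Similarly, the zero-size guards added to the generic kernels earlier in this series (the n <= 0 early returns in axpy.c, copy.c, swap.c and friends) and the double-precision accumulation restored for DSDOT in kernel/riscv64/dot.c follow one pattern: bail out before touching x or y when n is not positive, and widen each product to double before summing. A minimal standalone sketch under those assumptions (hypothetical dsdot_ref with plain C types, not OpenBLAS code):

    #include <stdio.h>

    /* Widened accumulation as in the DSDOT path of dot.c above. */
    static double dsdot_ref(long n, const float *x, long inc_x,
                            const float *y, long inc_y)
    {
        double dot = 0.0;
        long i = 0, ix = 0, iy = 0;

        if (n < 1) return dot;                   /* zero-size guard */

        while (i < n) {
            dot += (double)x[ix] * (double)y[iy];
            ix += inc_x;
            iy += inc_y;
            i++;
        }
        return dot;
    }

    int main(void)
    {
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
        printf("%f\n", dsdot_ref(4, x, 1, y, 1));   /* 20.000000 */
        printf("%f\n", dsdot_ref(0, x, 1, y, 1));   /* 0.000000, guarded */
        return 0;
    }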