diff --git a/.cirrus.yml b/.cirrus.yml index 7a13d123..d0e1eeff 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -41,7 +41,7 @@ macos_instance: # - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest task: name: AppleM1/LLVM x86_64 xbuild compile_script: @@ -58,8 +58,8 @@ task: - export VALID_ARCHS="i386 x86_64" - xcrun --sdk macosx --show-sdk-path - xcodebuild -version - - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64" + - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: @@ -70,7 +70,7 @@ task: # type: application/octet-streamm macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest task: name: AppleM1/LLVM armv8-ios xbuild compile_script: @@ -78,8 +78,10 @@ task: - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0" + - export 
CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" + - xcrun --sdk iphoneos --show-sdk-path + - ls -l /Applications - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 always: config_artifacts: @@ -96,11 +98,11 @@ task: - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - ls /System/Volumes/Data/opt/homebrew - - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/ + - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: @@ -132,7 +134,7 @@ task: FreeBSD_task: name: FreeBSD-gcc12 freebsd_instance: - image_family: freebsd-13-2 + image_family: freebsd-13-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -143,7 +145,7 @@ FreeBSD_task: FreeBSD_task: name: freebsd-gcc12-ilp64 freebsd_instance: - image_family: 
freebsd-13-2 + image_family: freebsd-13-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -153,10 +155,10 @@ FreeBSD_task: FreeBSD_task: name: FreeBSD-clang-openmp freebsd_instance: - image_family: freebsd-13-2 + image_family: freebsd-13-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc - - ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so + - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so compile_script: - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1 diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 68ba2ddd..a47ca1dc 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -84,6 +84,7 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH qemu-riscv64 ./utest/openblas_utest + qemu-riscv64 ./utest/openblas_utest_ext OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml new file mode 100644 index 00000000..25e196ef --- /dev/null +++ b/.github/workflows/codspeed-bench.yml @@ -0,0 +1,157 @@ +name: Run codspeed benchmarks + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + benchmarks: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + fortran: [gfortran] + build: [make] + pyver: ["3.12"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + fi + 
+ - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update + sudo apt-get install -y gfortran cmake ccache libtinfo5 + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Write out the .pc + run: | + cd benchmark/pybench + cat > openblas.pc << EOF + libdir=${{ github.workspace }} + includedir= ${{ github.workspace }} + openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64 + version=0.0.99 + extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas + Name: openblas + Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version + Version: ${version} + URL: https://github.com/xianyi/OpenBLAS + Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }} + Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas + Cflags: -I${{ github.workspace}} + EOF + cat openblas.pc + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + elif [ "$RUNNER_OS" = 
"macOS" ]; then + echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Install benchmark dependencies + run: pip install meson ninja numpy pytest pytest-codspeed --user + + - name: Build the wrapper + run: | + cd benchmark/pybench + export PKG_CONFIG_PATH=$PWD + meson setup build --prefix=$PWD/build-install + meson install -C build + # + # sanity check + cd build/openblas_wrap + python -c'import _flapack; print(dir(_flapack))' + + - name: Run benchmarks under pytest-benchmark + run: | + cd benchmark/pybench + pip install pytest-benchmark + export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/ + OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd' + + - name: Run benchmarks + uses: CodSpeedHQ/action@v2 + with: + token: ${{ secrets.CODSPEED_TOKEN }} + run: | + cd benchmark/pybench + export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/ + OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed + diff --git 
a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..da40b853 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,40 @@ +name: Publish docs via GitHub Pages + +on: + push: + branches: + - develop + pull_request: + branches: + - develop + +jobs: + build: + name: Deploy docs + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install MkDocs and doc theme packages + run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin + + - name: Build docs site + run: mkdocs build + + # mkdocs gh-deploy command only builds to the top-level, hence deploying + # with this action instead. + # Deploys to http://www.openmathlib.org/OpenBLAS/docs/ + - name: Deploy docs + uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 + if: ${{ github.ref == 'refs/heads/develop' }} + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./site + destination_dir: docs/ diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index f1bf8064..da7f6c9a 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -33,10 +33,8 @@ jobs: - name: Install APT deps run: | - sudo add-apt-repository ppa:savoury1/virtualisation sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ - qemu-user-static + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache - name: Download and install loongarch64-toolchain run: | @@ -44,6 +42,20 @@ jobs: #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt + - name: Checkout qemu + uses: actions/checkout@v3 + with: + repository: 
qemu/qemu + path: qemu + ref: master + + - name: Install qemu + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static + make -j$(nproc) + make install + - name: Set env run: | echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV @@ -76,44 +88,46 @@ jobs: - name: Test run: | - qemu-loongarch64-static ./utest/openblas_utest - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 + export 
PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-loongarch64 ./utest/openblas_utest + qemu-loongarch64 ./utest/openblas_utest_ext + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat1 rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < 
./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=2 
qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml new file mode 100644 index 00000000..d08e56f6 --- /dev/null +++ b/.github/workflows/loongarch64_clang.yml @@ -0,0 +1,135 @@ +name: loongarch64 clang qemu test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: LOONGSONGENERIC + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC + - target: LOONGSON3R5 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 + - target: LOONGSON2K1000 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 + - target: DYNAMIC_ARCH + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install libffi6 + run: | + wget http://ftp.ca.debian.org/debian/pool/main/libf/libffi/libffi6_3.2.1-9_amd64.deb + sudo dpkg -i libffi6_3.2.1-9_amd64.deb + + - name: Install APT deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache + + - name: Download and install loongarch64-toolchain + run: | + wget https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz + wget 
https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz + tar -xf clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz -C /opt + tar -xf loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz -C /opt + + - name: Checkout qemu + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: master + + - name: Install qemu + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static + make -j$(nproc) + make install + + - name: Set env + run: | + echo "PATH=$GITHUB_WORKSPACE:/opt/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10/bin:/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/bin:$PATH" >> $GITHUB_ENV + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Disable utest dsdot:dsdot_n_1 + run: | + echo -n > utest/test_dsdot.c + echo "Due to the qemu versions 7.2 causing utest cases to fail," + echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." 
+ + - name: Build OpenBLAS + run: make CC='ccache clang --target=loongarch64-linux-gnu --sysroot=/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/loongarch64-linux-gnu/sysroot/ -static' FC='ccache loongarch64-linux-gnu-gfortran -static' HOSTCC='ccache clang' CROSS_SUFFIX=llvm- NO_SHARED=1 ${{ matrix.opts }} -j$(nproc) + + - name: Test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-loongarch64 ./utest/openblas_utest + qemu-loongarch64 ./utest/openblas_utest_ext + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat + 
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat + diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index 4686ba71..1491aff7 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -80,6 +80,7 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH qemu-mips64el ./utest/openblas_utest + qemu-mips64el ./utest/openblas_utest_ext OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml index dd6fe9ca..9209ebb7 100644 --- 
a/.github/workflows/riscv64_vector.yml +++ b/.github/workflows/riscv64_vector.yml @@ -28,6 +28,9 @@ jobs: - target: RISCV64_ZVL256B opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 + - target: DYNAMIC_ARCH=1 + opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1 + qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 steps: - name: Checkout repository diff --git a/.gitignore b/.gitignore index dc6804f1..8294da4d 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ benchmark/smallscaling CMakeCache.txt CMakeFiles/* .vscode +**/__pycache__ diff --git a/CMakeLists.txt b/CMakeLists.txt index 096ca88b..20066041 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 27) +set(OpenBLAS_PATCH_VERSION 28) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") @@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) +set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. 
\"int\") for character lengths (defaults to size_t)") + option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) @@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) -option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) @@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago endif() endif() +if (APPLE AND BUILD_SHARED_LIBS) +set(CMAKE_MACOSX_RPATH ON) +endif() + # Seems that this hack doesn't required since macOS 11 Big Sur if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 20332082..d885a01b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -198,6 +198,9 @@ In chronological order: * PingTouGe Semiconductor Co., Ltd. * [2020-10] Add RISC-V Vector (0.7.1) support. 
Optimize BLAS kernels for Xuantie C910 +* Jake Arkinstall + * [2021-02-10] Remove in-source configure_file to enable builds in read-only contexts (issue #3100, PR #3101) + * River Dillon * [2021-07-10] fix compilation with musl libc diff --git a/Changelog.txt b/Changelog.txt index 03c3cfbd..7f89a2ea 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,127 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.28 + 8-Aug-2024 + +general: +- Reworked the unfinished implementation of HUGETLB from GotoBLAS + for allocating huge memory pages as buffers on suitable systems +- Changed the unfinished implementation of GEMM3M for the generic + target on all architectures to at least forward to regular GEMM +- Improved multithreaded GEMM performance for large non-skinny matrices +- Improved BLAS3 performance on larger multicore systems through improved + parallelism +- Improved performance of the initial memory allocation by reducing + locking overhead +- Improved performance of GBMV at small problem sizes by introducing + a size barrier for the switch to multithreading +- Added an implementation of the CBLAS_GEMM_BATCH extension +- Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in + CMAKE builds (error introduced in 0.3.27) +- Fixed corner cases involving the handling of NAN and INFINITY + arguments in ?SCAL on all architectures +- Added support for cross-compiling to WEBM with CMAKE (in addition + to the already present makefile support) +- Fixed NAN handling and potential accuracy issues in compilations with + Intel ICX by supplying a suitable fp-model option by default +- The contents of the github project wiki have been converted into + a new set of documentation included with the source code. 
+- It is now possible to register a callback function that replaces + the built-in support for multithreading with an external backend + like TBB (openblas_set_threads_callback_function) +- Fixed potential duplication of suffixes in shared library naming +- Improved C compiler detection by the build system to tolerate more + naming variants for gcc builds +- Fixed an unnecessary dependency of the utest on CBLAS +- Fixed spurious error reports from the BLAS extensions utest +- Fixed unwanted invocation of the GEMM3M tests in cross-compilation +- Fixed a flaw in the makefile build that could lead to the pkgconfig + file containing an entry of UNKNOWN for the target cpu after installing +- Integrated fixes from the Reference-LAPACK project: + - Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961) + - Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018) + - Fixed potential infinite loop in the LAPACK testsuite (PR 1024) + - Make the variable type used for hidden length arguments configurable (PR 1025) + - Fixed SYTRD workspace computation and various typos (PR 1030) + - Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033) + +x86-64: +- reverted thread management under Windows to its state before 0.3.26 + due to signs of race conditions in some circumstances now under study +- fixed accidental selection of the unoptimized generic SBGEMM kernel + in CMAKE builds for CooperLake and SapphireRapids targets +- fixed a potential thread buffer overrun in SBSTOBF16 on small systems +- fixed an accuracy issue in ZSCAL introduced in 0.3.26 +- fixed compilation with CMAKE and recent releases of LLVM +- added support for Intel Emerald Rapids and Meteor Lake cpus +- added autodetection support for the Zhaoxin KX-7000 cpu +- fixed autodetection of Intel Prescott (probably broken since 0.3.19) +- fixed compilation for older targets with the Yocto SDK +- fixed compilation of the converter-generated C versions + of the LAPACK 
sources with gcc-14 +- improved compiler options when building with CMAKE and LLVM for + AVX512-capable targets +- added support for supplying the L2 cache size via an environment + variable (OPENBLAS_L2_SIZE) in case it is not correctly reported + (as in some VM configurations) +- improved the error message shown when thread creation fails on startup +- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS + +arm: +- fixed building for baremetal targets with make + +arm64: +- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 + matrix to the corresponding GEMV kernel +- added optimized SGEMV and DGEMV kernels for A64FX +- added optimized SVE kernels for small-matrix GEMM +- added A64FX to the cpu list for DYNAMIC_ARCH +- fixed building with support for cpu affinity +- worked around accuracy problems with C/ZNRM2 on NeoverseN1 and + Apple M targets +- improved GEMM performance on Neoverse V1 +- fixed compilation for NEOVERSEN2 with older compilers +- fixed potential miscompilation of the SVE SDOT and DDOT kernels +- fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels +- fixed a potential overflow when using very large user-defined BUFFERSIZE +- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS + +power: +- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 + matrix to the corresponding GEMV kernel +- significantly improved performance of SBGEMM on POWER10 +- fixed compilation with OpenMP and the XLF compiler +- fixed building of the BLAS extension utests under AIX +- fixed building of parts of the LAPACK testsuite with XLF +- fixed CSWAP/ZSWAP on big-endian POWER10 targets +- fixed a performance regression in SAXPY on POWER10 with OpenXL +- fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM +- fixed building for POWER9 under FreeBSD +- fixed a potential overflow when using very large user-defined BUFFERSIZE +- fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV 
+ +riscv64: +- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 + matrix to the corresponding GEMV kernel +- fixed building for RISCV64_GENERIC with OpenMP enabled +- added DYNAMIC_ARCH support (comprising GENERIC_RISCV64 and the two + RVV 1.0 targets with vector length of 128 and 256) +- worked around the ZVL128B kernels for AXPBY mishandling the special + case of zero Y increment + +loongarch64: +- improved GEMM performance on servers of the 3C5000 generation +- improved performance and stability of DGEMM +- improved GEMV and TRSM kernels for LSX and LASX vector ABIs +- fixed CMAKE compilation with the INTERFACE64 option set +- fixed compilation with CMAKE +- worked around spurious errors flagged by the BLAS3 tests +- worked around a miscompilation of the POTRS utest by gcc 14.1 + +mips64: +- fixed ASUM and SUM kernels to accept negative step sizes in X +- fixed complex GEMV kernels for MSA + ==================================================================== Version 0.3.27 4-Apr-2024 diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr index 96e18b8a..b2f8ce2e 100644 --- a/Jenkinsfile.pwr +++ b/Jenkinsfile.pwr @@ -1,7 +1,7 @@ pipeline { agent { docker { - image 'osuosl/ubuntu-ppc64le' + image 'osuosl/ubuntu-ppc64le:18.04' } } stages { diff --git a/Makefile b/Makefile index 76b4f228..5fd0f919 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,10 @@ else LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) endif +ifdef LAPACK_STRLEN +LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN) +endif + SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test .PHONY : all libs netlib $(RELA) test ctest shared install diff --git a/Makefile.arm64 b/Makefile.arm64 index eeb72873..fccc0d0d 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -145,13 +145,13 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += 
-march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else -CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72 endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a+sve +CCOMMON_OPT += -march=armv8.5-a+sve+bf16 ifneq ($(CROSS), 1) CCOMMON_OPT += -mtune=native endif @@ -163,19 +163,29 @@ endif endif endif else -CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve+bf16 -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif endif endif +# Detect ARM Neoverse V2. +ifeq ($(CORE), NEOVERSEV2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG))) +CCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2 +endif +endif +endif + # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) @@ -266,12 +276,19 @@ endif endif endif -ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) ifeq ($(CORE), A64FX) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ3) $(GCCVERSIONGTEQ11) $(ISCLANG))) CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx endif +else +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-n1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-n1 +endif +endif endif endif diff --git a/Makefile.install b/Makefile.install index 01be0d99..9b38f276 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,18 +72,18 @@ ifndef NO_CBLAS @echo Generating 
cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @cp cblas.h cblas.tmp ifdef SYMBOLPREFIX - @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 - @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^() ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^() ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp #change back any openblas_complex_float and double that got hit @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 - @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp + @sed 's/goto[^() ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp endif ifdef SYMBOLSUFFIX - @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 - @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^() ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^() ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp #change back any openblas_complex_float and double that got hit @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 - @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp + @sed 's/goto[^() ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp endif @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif @@ -169,7 +169,7 @@ endif @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" - @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" + @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 
'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" @echo 'version='$(VERSION) >> "$(PKGFILE)" @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" @cat openblas.pc.in >> "$(PKGFILE)" diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 113cc57c..9f6e48b7 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -8,13 +8,13 @@ FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL256B) CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d endif ifeq ($(CORE), RISCV64_ZVL128B) CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static +FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d endif diff --git a/Makefile.rule b/Makefile.rule index e920ea48..ac62d49c 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.27 +VERSION = 0.3.28 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library @@ -134,6 +134,12 @@ VERSION = 0.3.27 # Build LAPACK Deprecated functions since LAPACK 3.6.0 BUILD_LAPACK_DEPRECATED = 1 +# The variable type assumed for the length of character arguments when passing +# data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC +# versions used "int"). 
Mismatches will not cause runtime failures but may result +# in build warnings or errors when building with link-time optimization (LTO) +# LAPACK_STRLEN=int + # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 # Have RecursiveLAPACK actually replace standard LAPACK routines instead of @@ -173,6 +179,10 @@ NO_AFFINITY = 1 # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 +# If you are compiling for an embedded system ("bare metal") like Cortex M series +# Note that you will have to provide implementations of malloc() and free() in this case +# EMBEDDED = 1 + # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # and OS. However, the performance is low. # NO_AVX = 1 @@ -215,6 +225,16 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 +# Use large page allocation (called hugepage support in Linux context) +# for the thread buffers (with access by shared memory operations) +# HUGETLB_ALLOCATION = 1 + +# Use large page allocation called hugepages in Linux) based on mmap accessing +# a memory-backed pseudofile (requires hugetlbfs to be mounted in the system, +# the example below has it mounted on /hugepages. OpenBLAS will create the backing +# file as gotoblas.processid in that path) +# HUGETLBFILE_ALLOCATION = /hugepages + # If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). 
# CONSISTENT_FPCSR = 1 diff --git a/Makefile.system b/Makefile.system index 312badf8..a7d9b197 100644 --- a/Makefile.system +++ b/Makefile.system @@ -266,10 +266,30 @@ SMALL_MATRIX_OPT = 1 else ifeq ($(ARCH), power) SMALL_MATRIX_OPT = 1 BUILD_BFLOAT16 = 1 +else ifeq ($(ARCH), arm64) +SMALL_MATRIX_OPT = 1 +endif +ifeq ($(ARCH), loongarch64) +SMALL_MATRIX_OPT = 1 +endif +ifeq ($(ARCH), arm64) +GEMM_GEMV_FORWARD = 1 +endif +ifeq ($(ARCH), riscv) +GEMM_GEMV_FORWARD = 1 +endif +ifeq ($(ARCH), power) +GEMM_GEMV_FORWARD = 1 endif + ifeq ($(SMALL_MATRIX_OPT), 1) CCOMMON_OPT += -DSMALL_MATRIX_OPT endif +ifeq ($(GEMM_GEMV_FORWARD), 1) +ifneq ($(ONLY_CBLAS), 1) +CCOMMON_OPT += -DGEMM_GEMV_FORWARD +endif +endif # This operation is expensive, so execution should be once. ifndef GOTOBLAS_MAKEFILE @@ -354,6 +374,9 @@ OBJCONV = $(CROSS_SUFFIX)objconv ifeq ($(NOFORTRAN), 1) C_LAPACK = 1 override FEXTRALIB = +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -Wno-error=incompatible-pointer-types +endif endif ifeq ($(C_COMPILER), GCC) @@ -681,6 +704,7 @@ ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE +DYNAMIC_CORE += A64FX endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 @@ -707,6 +731,17 @@ ifeq ($(ARCH), loongarch64) DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC endif +ifeq ($(ARCH), riscv64) +DYNAMIC_CORE = RISCV64_GENERIC +DYNAMIC_CORE += RISCV64_ZVL128B +DYNAMIC_CORE += RISCV64_ZVL256B +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = RISCV64_GENERIC $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_RISCV64_GENERIC +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +endif +endif + ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC @@ -809,8 +844,12 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 +ifneq ($(EMBEDDED), 1) CCOMMON_OPT += -marm FCOMMON_OPT += -marm +else +CCOMMON_OPT += -DOS_EMBEDDED -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 +endif # If softfp abi is 
mentioned on the command line, force it. ifeq ($(ARM_SOFTFP_ABI), 1) @@ -953,12 +992,18 @@ endif ifeq ($(ARCH), loongarch64) LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) +LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64) ifneq ($(LA64_ABI), lp64d) LA64_ABI=lp64 endif +ifneq ($(LA64_ARCH), loongarch64) +CCOMMON_OPT += -mabi=$(LA64_ABI) +FCOMMON_OPT += -mabi=$(LA64_ABI) +else CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) endif +endif endif @@ -1193,9 +1238,6 @@ endif else FCOMMON_OPT += -q32 endif -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp -endif endif ifeq ($(F_COMPILER), PGI) @@ -1580,13 +1622,23 @@ ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE endif +ifdef SHMEM_ALLOCATION +ifneq ($(SHMEM_ALLOCATION), 0) +CCOMMON_OPT += -DALLOC_SHM +endif +endif + ifdef HUGETLB_ALLOCATION +ifneq ($(HUGETLB_ALLOCATION), 0) CCOMMON_OPT += -DALLOC_HUGETLB endif +endif ifdef HUGETLBFILE_ALLOCATION +ifneq ($(HUGETLBFILE_ALLOCATION), 0) CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION) endif +endif ifdef STATIC_ALLOCATION CCOMMON_OPT += -DALLOC_STATIC diff --git a/Makefile.x86_64 b/Makefile.x86_64 index c0dbe84c..e9831327 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,11 @@ endif endif endif +ifeq ($(C_COMPILER), CLANG) +ifeq ($(findstring icx,$(CC)),icx) +CCOMMON_OPT += -fp-model=consistent +endif +endif ifneq ($(DYNAMIC_ARCH),1) ADD_CPUFLAGS = 1 diff --git a/README.md b/README.md index a37459b8..169087ce 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **AIX**: Dynamic architecture with OpenXL and OpenMP. 
```sh - make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1 + make CC=ibm-clang_r FC=xlf_r TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1 ``` #### IBM zEnterprise System @@ -234,6 +234,8 @@ For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additi on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. +On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support. A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled. + The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3ae8615a..d72baabe 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -133,29 +133,29 @@ jobs: mkdir build cd build call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang-new -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. cmake --build . 
--config Release ctest - + ctest --rerun-failed --output-on-failure - job: OSX_OpenMP pool: - vmImage: 'macOS-11' + vmImage: 'macOS-12' steps: - script: | brew update - make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 - make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-13 FC=gfortran-13 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-13 FC=gfortran-13 PREFIX=../blasinst install ls -lR ../blasinst - job: OSX_GCC_Nothreads pool: - vmImage: 'macOS-11' + vmImage: 'macOS-12' steps: - script: | brew update - make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 + make USE_THREADS=0 CC=gcc-13 FC=gfortran-13 - job: OSX_GCC12 pool: @@ -195,7 +195,7 @@ jobs: - job: OSX_dynarch_cmake pool: - vmImage: 'macOS-11' + vmImage: 'macOS-12' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -203,7 +203,7 @@ jobs: - script: | mkdir build cd build - cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_Fortran_COMPILER=gfortran-13 -DBUILD_SHARED_LIBS=ON .. cmake --build . 
ctest @@ -242,7 +242,7 @@ jobs: - job: OSX_NDK_ARMV7 pool: - vmImage: 'macOS-11' + vmImage: 'macOS-12' steps: - script: | brew update @@ -252,35 +252,35 @@ jobs: - job: OSX_IOS_ARMV8 pool: - vmImage: 'macOS-11' + vmImage: 'macOS-12' variables: - CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 + CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0 steps: - script: | make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 - job: OSX_IOS_ARMV7 pool: - vmImage: 'macOS-11' + vmImage: 'macOS-12' variables: - CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 + CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1 steps: - script: | make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 - job: OSX_xbuild_DYNAMIC_ARM64 pool: - vmImage: 'macOS-11' + vmImage: 'macOS-12' variables: - CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - CFLAGS: -O2 -Wno-macro-redefined -isysroot 
/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64 + CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64 steps: - script: | - ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs - /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus - /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version + ls /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs + /Applications/Xcode_12.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus + /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 - job: ALPINE_MUSL diff --git a/benchmark/pybench/README.md b/benchmark/pybench/README.md new file mode 100644 index 00000000..7523ca75 --- /dev/null +++ b/benchmark/pybench/README.md @@ -0,0 +1,49 @@ +# Continuous benchmarking of OpenBLAS performance + +We run a set of benchmarks of subset of OpenBLAS functionality. + +## Benchmark runner + +[![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/OpenMathLib/OpenBLAS/) + +Click on [benchmarks](https://codspeed.io/OpenMathLib/OpenBLAS/benchmarks) to see the performance of a particular benchmark over time; +Click on [branches](https://codspeed.io/OpenMathLib/OpenBLAS/branches/) and then on the last PR link to see the flamegraphs. 
+ +## What are the benchmarks + +We run raw BLAS/LAPACK subroutines, via f2py-generated python wrappers. The wrappers themselves are equivalent to [those from SciPy](https://docs.scipy.org/doc/scipy/reference/linalg.lapack.html). +In fact, the wrappers _are_ from SciPy, we take a small subset simply to avoid having to build the whole SciPy for each CI run. + + +## Adding a new benchmark + +`.github/workflows/codspeed-bench.yml` does all the orchestration on CI. + +Benchmarks live in the `benchmark/pybench` directory. It is organized as follows: + +- benchmarks themselves live in the `benchmarks` folder. Note that the LAPACK routines are imported from the `openblas_wrap` package. +- the `openblas_wrap` package is a simple trampoline: it contains an f2py extension, `_flapack`, which talks to OpenBLAS, and exports the python names in its `__init__.py`. +This way, the `openblas_wrap` package shields the benchmarks from the details of where a particular LAPACK function comes from. If wanted, you may for instance swap the `_flapack` extension to +`scipy.linalg.blas` and `scipy.linalg.lapack`. + +To change parameters of an existing benchmark, edit python files in the `benchmark/pybench/benchmarks` directory. + +To add a benchmark for a new BLAS or LAPACK function, you need to: + +- add an f2py wrapper for the bare LAPACK function. You can simply copy a wrapper from SciPy (look for `*.pyf.src` files in https://github.com/scipy/scipy/tree/main/scipy/linalg) +- add an import to `benchmark/pybench/openblas_wrap/__init__.py` + + +## Running benchmarks locally + +This benchmarking layer is orchestrated from python, therefore you'll need to +have all what it takes to build OpenBLAS from source, plus `python` and + +``` +$ python -mpip install numpy meson ninja pytest pytest-benchmark +``` + +The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/test_blas.py`. 
+ +An ASV compatible benchmark suite is planned but currently not implemented. + diff --git a/benchmark/pybench/benchmarks/bench_blas.py b/benchmark/pybench/benchmarks/bench_blas.py new file mode 100644 index 00000000..8127dd0c --- /dev/null +++ b/benchmark/pybench/benchmarks/bench_blas.py @@ -0,0 +1,274 @@ +import pytest +import numpy as np +import openblas_wrap as ow + +dtype_map = { + 's': np.float32, + 'd': np.float64, + 'c': np.complex64, + 'z': np.complex128, + 'dz': np.complex128, +} + + +# ### BLAS level 1 ### + +# dnrm2 + +dnrm2_sizes = [100, 1000] + +def run_dnrm2(n, x, incx, func): + res = func(x, n, incx=incx) + return res + + +@pytest.mark.parametrize('variant', ['d', 'dz']) +@pytest.mark.parametrize('n', dnrm2_sizes) +def test_nrm2(benchmark, n, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + + x = np.array(rndm.uniform(size=(n,)), dtype=dtyp) + nrm2 = ow.get_func('nrm2', variant) + result = benchmark(run_dnrm2, n, x, 1, nrm2) + + +# ddot + +ddot_sizes = [100, 1000] + +def run_ddot(x, y, func): + res = func(x, y) + return res + + +@pytest.mark.parametrize('n', ddot_sizes) +def test_dot(benchmark, n): + rndm = np.random.RandomState(1234) + + x = np.array(rndm.uniform(size=(n,)), dtype=float) + y = np.array(rndm.uniform(size=(n,)), dtype=float) + dot = ow.get_func('dot', 'd') + result = benchmark(run_ddot, x, y, dot) + + +# daxpy + +daxpy_sizes = [100, 1000] + +def run_daxpy(x, y, func): + res = func(x, y, a=2.0) + return res + + +@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z']) +@pytest.mark.parametrize('n', daxpy_sizes) +def test_daxpy(benchmark, n, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + + x = np.array(rndm.uniform(size=(n,)), dtype=dtyp) + y = np.array(rndm.uniform(size=(n,)), dtype=dtyp) + axpy = ow.get_func('axpy', variant) + result = benchmark(run_daxpy, x, y, axpy) + + +# ### BLAS level 2 ### + +gemv_sizes = [100, 1000] + +def run_gemv(a, x, y, func): + res = func(1.0, a, 
x, y=y, overwrite_y=True) + return res + + +@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z']) +@pytest.mark.parametrize('n', gemv_sizes) +def test_dgemv(benchmark, n, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + + x = np.array(rndm.uniform(size=(n,)), dtype=dtyp) + y = np.empty(n, dtype=dtyp) + + a = np.array(rndm.uniform(size=(n,n)), dtype=dtyp) + x = np.array(rndm.uniform(size=(n,)), dtype=dtyp) + y = np.zeros(n, dtype=dtyp) + + gemv = ow.get_func('gemv', variant) + result = benchmark(run_gemv, a, x, y, gemv) + + assert result is y + + +# dgbmv + +dgbmv_sizes = [100, 1000] + +def run_gbmv(m, n, kl, ku, a, x, y, func): + res = func(m, n, kl, ku, 1.0, a, x, y=y, overwrite_y=True) + return res + + + +@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z']) +@pytest.mark.parametrize('n', dgbmv_sizes) +@pytest.mark.parametrize('kl', [1]) +def test_dgbmv(benchmark, n, kl, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + + x = np.array(rndm.uniform(size=(n,)), dtype=dtyp) + y = np.empty(n, dtype=dtyp) + + m = n + + a = rndm.uniform(size=(2*kl + 1, n)) + a = np.array(a, dtype=dtyp, order='F') + + gbmv = ow.get_func('gbmv', variant) + result = benchmark(run_gbmv, m, n, kl, kl, a, x, y, gbmv) + assert result is y + + +# ### BLAS level 3 ### + +# dgemm + +gemm_sizes = [100, 1000] + +def run_gemm(a, b, c, func): + alpha = 1.0 + res = func(alpha, a, b, c=c, overwrite_c=True) + return res + + +@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z']) +@pytest.mark.parametrize('n', gemm_sizes) +def test_gemm(benchmark, n, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + a = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F') + b = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F') + c = np.empty((n, n), dtype=dtyp, order='F') + gemm = ow.get_func('gemm', variant) + result = benchmark(run_gemm, a, b, c, gemm) + assert result is c + + +# dsyrk + +syrk_sizes = [100, 
1000] + + +def run_syrk(a, c, func): + res = func(1.0, a, c=c, overwrite_c=True) + return res + + +@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z']) +@pytest.mark.parametrize('n', syrk_sizes) +def test_syrk(benchmark, n, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + a = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F') + c = np.empty((n, n), dtype=dtyp, order='F') + syrk = ow.get_func('syrk', variant) + result = benchmark(run_syrk, a, c, syrk) + assert result is c + + +# ### LAPACK ### + +# linalg.solve + +gesv_sizes = [100, 1000] + + +def run_gesv(a, b, func): + res = func(a, b, overwrite_a=True, overwrite_b=True) + return res + + +@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z']) +@pytest.mark.parametrize('n', gesv_sizes) +def test_gesv(benchmark, n, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + + a = (np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F') + + np.eye(n, dtype=dtyp, order='F')) + b = np.array(rndm.uniform(size=(n, 1)), dtype=dtyp, order='F') + gesv = ow.get_func('gesv', variant) + lu, piv, x, info = benchmark(run_gesv, a, b, gesv) + assert lu is a + assert x is b + assert info == 0 + + +# linalg.svd + +gesdd_sizes = [(100, 5), (1000, 222)] + + +def run_gesdd(a, lwork, func): + res = func(a, lwork=lwork, full_matrices=False, overwrite_a=False) + return res + + +@pytest.mark.parametrize('variant', ['s', 'd']) +@pytest.mark.parametrize('mn', gesdd_sizes) +def test_gesdd(benchmark, mn, variant): + m, n = mn + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + + a = np.array(rndm.uniform(size=(m, n)), dtype=dtyp, order='F') + + gesdd_lwork = ow.get_func('gesdd_lwork', variant) + + lwork, info = gesdd_lwork(m, n) + lwork = int(lwork) + assert info == 0 + + gesdd = ow.get_func('gesdd', variant) + u, s, vt, info = benchmark(run_gesdd, a, lwork, gesdd) + + assert info == 0 + + atol = {'s': 1e-5, 'd': 1e-13} + np.testing.assert_allclose(u @ np.diag(s) @ 
vt, a, atol=atol[variant]) + + +# linalg.eigh + +syev_sizes = [50, 200] + + +def run_syev(a, lwork, func): + res = func(a, lwork=lwork, overwrite_a=True) + return res + + +@pytest.mark.parametrize('variant', ['s', 'd']) +@pytest.mark.parametrize('n', syev_sizes) +def test_syev(benchmark, n, variant): + rndm = np.random.RandomState(1234) + dtyp = dtype_map[variant] + + a = rndm.uniform(size=(n, n)) + a = np.asarray(a + a.T, dtype=dtyp, order='F') + a_ = a.copy() + + dsyev_lwork = ow.get_func('syev_lwork', variant) + lwork, info = dsyev_lwork(n) + lwork = int(lwork) + assert info == 0 + + syev = ow.get_func('syev', variant) + w, v, info = benchmark(run_syev, a, lwork, syev) + + assert info == 0 + assert a is v # overwrite_a=True + + diff --git a/benchmark/pybench/meson.build b/benchmark/pybench/meson.build new file mode 100644 index 00000000..5d921c9e --- /dev/null +++ b/benchmark/pybench/meson.build @@ -0,0 +1,48 @@ +# +# Taken from SciPy (of course) +# +project( + 'openblas-wrap', + 'c', 'fortran', + version: '0.1', + license: 'BSD-3', + meson_version: '>= 1.1.0', + default_options: [ + 'buildtype=debugoptimized', + 'b_ndebug=if-release', + 'c_std=c17', + 'fortran_std=legacy', + ], +) + +py3 = import('python').find_installation(pure: false) +py3_dep = py3.dependency() + +cc = meson.get_compiler('c') + +_global_c_args = cc.get_supported_arguments( + '-Wno-unused-but-set-variable', + '-Wno-unused-function', + '-Wno-conversion', + '-Wno-misleading-indentation', +) +add_project_arguments(_global_c_args, language : 'c') + +# We need -lm for all C code (assuming it uses math functions, which is safe to +# assume for SciPy). For C++ it isn't needed, because libstdc++/libc++ is +# guaranteed to depend on it. For Fortran code, Meson already adds `-lm`. 
+m_dep = cc.find_library('m', required : false) +if m_dep.found() + add_project_link_arguments('-lm', language : 'c') +endif + +generate_f2pymod = find_program('openblas_wrap/generate_f2pymod.py') + +openblas = dependency('openblas', method: 'pkg-config', required: true) +openblas_dep = declare_dependency( + dependencies: openblas, + compile_args: [] +) + + +subdir('openblas_wrap') diff --git a/benchmark/pybench/openblas_wrap/__init__.py b/benchmark/pybench/openblas_wrap/__init__.py new file mode 100644 index 00000000..9babb191 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/__init__.py @@ -0,0 +1,17 @@ +""" +Trampoline to hide the LAPACK details (scipy.lapack.linalg or scipy_openblas32 or...) +from benchmarking. +""" + +__version__ = "0.1" + + +from . import _flapack + +PREFIX = '' + + +def get_func(name, variant): + """get_func('gesv', 'c') -> cgesv etc.""" + return getattr(_flapack, PREFIX + variant + name) + diff --git a/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src new file mode 100644 index 00000000..1ee1d3c3 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src @@ -0,0 +1,417 @@ +! +! Taken from scipy/linalg +! +! Shorthand notations +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! Level 1 BLAS +! + + +python module _flapack + usercode ''' +#define F_INT int +''' + +interface + + +subroutine axpy(n,a,x,offx,incx,y,offy,incy) + ! Calculate z = a*x+y, where a is scalar. 
+ + callstatement (*f2py_func)(&n,&a,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,*,*,F_INT*,*,F_INT* + + dimension(*), intent(in) :: x + dimension(*), intent(in,out,out=z) :: y + optional, intent(in):: a=<1.0,\0,(1.0\,0.0),\2> + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end subroutine axpy + +function ddot(n,x,offx,incx,y,offy,incy) result (xy) + ! Computes a vector-vector dot product. + + callstatement ddot_return_value = (*f2py_func)(&n,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,double*,F_INT*,double*,F_INT* + intent(c) ddot + fortranname F_FUNC(ddot,DDOT) + + double precision dimension(*), intent(in) :: x + double precision dimension(*), intent(in) :: y + double precision ddot,xy + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end function ddot + + +function nrm2(n,x,offx,incx) result(n2) + + nrm2, n2 + + callstatement nrm2_return_value = (*f2py_func)(&n,x+offx,&incx) + callprotoargument F_INT*,*,F_INT* + intent(c) nrm2 + fortranname F_FUNC(nrm2,NRM2) + + dimension(*),intent(in) :: x + + integer optional, intent(in),check(incx>0) :: incx = 1 + + integer optional,intent(in),depend(x) :: offx=0 + check(offx>=0 && offx(n-1)*abs(incx)) :: n + +end function nrm2 + + +! +! Level 2 BLAS +! + + +subroutine gemv(m,n,alpha,a,x,beta,y,offx,incx,offy,incy,trans,rows,cols,ly) + ! Computes a matrix-vector product using a general matrix + ! + ! 
y = gemv(alpha,a,x,beta=0,y=0,offx=0,incx=1,offy=0,incy=0,trans=0) + ! Calculate y <- alpha * op(A) * x + beta * y + + callstatement (*f2py_func)((trans?(trans==2?"C":"T"):"N"),&m,&n,&alpha,a,&m, & + x+offx,&incx,&beta,y+offy,&incy) + callprotoargument char*,F_INT*,F_INT*,*,*,F_INT*,*,F_INT*,*, & + *,F_INT* + + integer optional, intent(in), check(trans>=0 && trans <=2) :: trans = 0 + integer optional, intent(in), check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in), check(incy>0||incy<0) :: incy = 1 + intent(in) :: alpha + intent(in), optional :: beta = <0.0,\0,(0.0\,0.0),\2> + + dimension(*), intent(in) :: x + dimension(ly), intent(in,copy,out), depend(ly),optional :: y + integer intent(hide), depend(incy,rows,offy) :: ly = & + (y_capi==Py_None?1+offy+(rows-1)*abs(incy):-1) + dimension(m,n), intent(in) :: a + integer depend(a), intent(hide):: m = shape(a,0) + integer depend(a), intent(hide):: n = shape(a,1) + + integer optional, intent(in) :: offx=0 + integer optional, intent(in) :: offy=0 + check(offx>=0 && offxoffx+(cols-1)*abs(incx)) :: x + depend(offx,cols,incx) :: x + + check(offy>=0 && offyoffy+(rows-1)*abs(incy)) :: y + depend(offy,rows,incy) :: y + + integer depend(m,n,trans), intent(hide) :: rows = (trans?n:m) + integer depend(m,n,trans), intent(hide) :: cols = (trans?m:n) + +end subroutine gemv + + +subroutine gbmv(m,n,kl,ku,alpha,a,lda,x,incx,offx,beta,y,incy,offy,trans,ly) + ! Performs one of the matrix-vector operations + ! + ! y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, + ! or y := alpha*A**H*x + beta*y, + ! + ! where alpha and beta are scalars, x and y are vectors and A is an + ! m by n band matrix, with kl sub-diagonals and ku super-diagonals. 
+ + callstatement (*f2py_func)((trans?(trans==2?"C":"T"):"N"),&m,&n,&kl,&ku,&alpha,a,&lda,x+offx,&incx,&beta,y+offy,&incy) + callprotoargument char*,F_INT*,F_INT*,F_INT*,F_INT*,*,*,F_INT*,*,F_INT*,*,*,F_INT* + + integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0 + integer intent(in), depend(ku,kl),check(m>=ku+kl+1) :: m + integer intent(in),check(n>=0&&n==shape(a,1)),depend(a) :: n + integer intent(in),check(kl>=0) :: kl + integer intent(in),check(ku>=0) :: ku + integer intent(hide),depend(a) :: lda = MAX(shape(a,0),1) + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer intent(hide),depend(m,n,incy,offy,trans) :: ly = & + (y_capi==Py_None?1+offy+(trans==0?m-1:n-1)*abs(incy):-1) + integer optional, intent(in) :: offx=0 + integer optional, intent(in) :: offy=0 + + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2> + + dimension(lda,n),intent(in) :: a + + dimension(ly), intent(in,out,copy,out=yout),depend(ly),optional :: y + check(offy>=0 && offyoffy+(trans==0?m-1:n-1)*abs(incy)) :: y + depend(offy,n,incy) :: y + + dimension(*), intent(in) :: x + check(offx>=0 && offxoffx+(trans==0?n-1:m-1)*abs(incx)) :: x + depend(offx,n,incx) :: x + +end subroutine gbmv + + + +! +! Level 3 BLAS +! + + +subroutine gemm(m,n,k,alpha,a,b,beta,c,trans_a,trans_b,lda,ka,ldb,kb) + ! Computes a scalar-matrix-matrix product and adds the result to a + ! scalar-matrix product. + ! + ! c = gemm(alpha,a,b,beta=0,c=0,trans_a=0,trans_b=0,overwrite_c=0) + ! 
Calculate C <- alpha * op(A) * op(B) + beta * C + + callstatement (*f2py_func)((trans_a?(trans_a==2?"C":"T"):"N"), & + (trans_b?(trans_b==2?"C":"T"):"N"),&m,&n,&k,&alpha,a,&lda,b,&ldb,&beta,c,&m) + callprotoargument char*,char*,F_INT*,F_INT*,F_INT*,*,*,F_INT*,*, & + F_INT*,*,*,F_INT* + + integer optional,intent(in),check(trans_a>=0 && trans_a <=2) :: trans_a = 0 + integer optional,intent(in),check(trans_b>=0 && trans_b <=2) :: trans_b = 0 + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2> + + dimension(lda,ka),intent(in) :: a + dimension(ldb,kb),intent(in) :: b + dimension(m,n),intent(in,out,copy),depend(m,n),optional :: c + check(shape(c,0)==m && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + integer depend(b),intent(hide) :: ldb = shape(b,0) + integer depend(b),intent(hide) :: kb = shape(b,1) + + integer depend(a,trans_a,ka,lda),intent(hide):: m = (trans_a?ka:lda) + integer depend(a,trans_a,ka,lda),intent(hide):: k = (trans_a?lda:ka) + integer depend(b,trans_b,kb,ldb,k),intent(hide),check(trans_b?kb==k:ldb==k) :: & + n = (trans_b?ldb:kb) + +end subroutine gemm + + +subroutine rk(n,k,alpha,a,beta,c,trans,lower,lda,ka) + ! performs one of the symmetric rank k operations + ! C := alpha*A*A**T + beta*C, or C := alpha*A**T*A + beta*C, + ! + ! c = syrk(alpha,a,beta=0,c=0,trans=0,lower=0,overwrite_c=0) + ! 
+ callstatement (*f2py_func)((lower?"L":"U"), & + (trans?(trans==2?"C":"T"):"N"), &n,&k,&alpha,a,&lda,&beta,c,&n) + callprotoargument char*,char*,F_INT*,F_INT*,*,*,F_INT*,*, & + *,F_INT* + + integer optional, intent(in),check(lower==0||lower==1) :: lower = 0 + integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0 + + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2,\2,\2> + + dimension(lda,ka),intent(in) :: a + dimension(n,n),intent(in,out,copy),depend(n),optional :: c + check(shape(c,0)==n && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + + integer depend(a, trans, ka, lda), intent(hide) :: n = (trans ? ka : lda) + integer depend(a, trans, ka, lda), intent(hide) :: k = (trans ? lda : ka) + +end subroutine rk + + +! +! LAPACK +! + +subroutine gesv(n,nrhs,a,piv,b,info) + ! lu,piv,x,info = gesv(a,b,overwrite_a=0,overwrite_b=0) + ! Solve A * X = B. + ! A = P * L * U + ! U is upper diagonal triangular, L is unit lower triangular, + ! piv pivots columns. + + callstatement {F_INT i;(*f2py_func)(&n,&nrhs,a,&n,piv,b,&n,&info);for(i=0;i\*,F_INT*,F_INT*,*,F_INT*,F_INT* + + integer depend(a),intent(hide):: n = shape(a,0) + integer depend(b),intent(hide):: nrhs = shape(b,1) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + integer dimension(n),depend(n),intent(out) :: piv + dimension(n,nrhs),check(shape(a,0)==shape(b,0)),depend(n) :: b + integer intent(out)::info + intent(in,out,copy,out=x) b + intent(in,out,copy,out=lu) a +end subroutine gesv + + +subroutine gesdd(m,n,minmn,u0,u1,vt0,vt1,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! u,s,vt,info = gesdd(a,compute_uv=1,lwork=..,overwrite_a=0) + ! Compute the singular value decomposition (SVD) using divide and conquer: + ! A = U * SIGMA * transpose(V) + ! A - M x N matrix + ! U - M x M matrix or min(M,N) x N if full_matrices=False + ! 
SIGMA - M x N zero matrix with a main diagonal filled with min(M,N) + ! singular values + ! transpose(V) - N x N matrix or N x min(M,N) if full_matrices=False + + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,a,&m,s,u,&u0,vt,&vt0,work,&lwork,iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(hide),depend(a):: m = shape(a,0) + integer intent(hide),depend(a):: n = shape(a,1) + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: u1 = (compute_uv?(full_matrices?m:minmn):1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + integer intent(hide),depend(compute_uv,minmn) :: vt1 = (compute_uv?n:1) + dimension(m,n),intent(in,copy,aligned8) :: a + dimension(minmn),intent(out),depend(minmn) :: s + dimension(u0,u1),intent(out),depend(u0, u1) :: u + dimension(vt0,vt1),intent(out),depend(vt0, vt1) :: vt + dimension(lwork),intent(hide,cache),depend(lwork) :: work + integer optional,intent(in),depend(minmn,compute_uv) & + :: lwork = max((compute_uv?4*minmn*minmn+MAX(m,n)+9*minmn:MAX(14*minmn+4,10*minmn+2+25*(25+8))+MAX(m,n)),1) + integer intent(hide,cache),dimension(8*minmn),depend(minmn) :: iwork + integer intent(out)::info + +end subroutine gesdd + +subroutine gesdd_lwork(m,n,minmn,u0,vt0,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! 
LWORK computation for (S/D)GESDD + + fortranname gesdd + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,&a,&m,&s,&u,&u0,&vt,&vt0,&work,&lwork,&iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(in) :: m + integer intent(in) :: n + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + intent(hide) :: a + intent(hide) :: s + intent(hide) :: u + intent(hide) :: vt + intent(out) :: work + integer intent(hide) :: lwork = -1 + integer intent(hide) :: iwork + integer intent(out) :: info + +end subroutine gesdd_lwork + + +subroutine syev(compute_v,lower,n,w,a,lda,work,lwork,info) + ! w,v,info = syev(a,compute_v=1,lower=0,lwork=3*n-1,overwrite_a=0) + ! Compute all eigenvalues and, optionally, eigenvectors of a + ! real symmetric matrix A. + ! + ! Performance tip: + ! If compute_v=0 then set also overwrite_a=1. 
+ + callstatement (*f2py_func)((compute_v?"V":"N"),(lower?"L":"U"),&n,a,&lda,w,work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer optional,intent(in):: compute_v = 1 + check(compute_v==1||compute_v==0) compute_v + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(a):: n = shape(a,0) + integer intent(hide),depend(a):: lda = MAX(1,shape(a,0)) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + intent(in,copy,out,out=v) :: a + + dimension(n),intent(out),depend(n) :: w + + integer optional,intent(in),depend(n) :: lwork=max(3*n-1,1) + check(lwork>=3*n-1) :: lwork + dimension(lwork),intent(hide),depend(lwork) :: work + + integer intent(out) :: info + +end subroutine syev + + +subroutine syev_lwork(lower,n,w,a,lda,work,lwork,info) + ! LWORK routines for syev + + fortranname syev + + callstatement (*f2py_func)("N",(lower?"L":"U"),&n,&a,&lda,&w,&work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer intent(in):: n + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(n):: lda = MAX(1, n) + intent(hide):: a + intent(hide):: w + integer intent(hide):: lwork = -1 + + intent(out):: work + integer intent(out):: info + +end subroutine syev_lwork + +end interface + +end python module _flapack + + + diff --git a/benchmark/pybench/openblas_wrap/generate_f2pymod.py b/benchmark/pybench/openblas_wrap/generate_f2pymod.py new file mode 100644 index 00000000..5a8ba138 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/generate_f2pymod.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Process f2py template files (`filename.pyf.src` -> `filename.pyf`) + +Usage: python generate_pyf.py filename.pyf.src -o filename.pyf +""" + +import os +import sys +import re +import subprocess +import argparse + + +# START OF CODE VENDORED FROM `numpy.distutils.from_template` 
+############################################################# +""" +process_file(filename) + + takes templated file .xxx.src and produces .xxx file where .xxx + is .pyf .f90 or .f using the following template rules: + + '<..>' denotes a template. + + All function and subroutine blocks in a source file with names that + contain '<..>' will be replicated according to the rules in '<..>'. + + The number of comma-separated words in '<..>' will determine the number of + replicates. + + '<..>' may have two different forms, named and short. For example, + + named: + where anywhere inside a block '

' will be replaced with + 'd', 's', 'z', and 'c' for each replicate of the block. + + <_c> is already defined: <_c=s,d,c,z> + <_t> is already defined: <_t=real,double precision,complex,double complex> + + short: + , a short form of the named, useful when no

appears inside + a block. + + In general, '<..>' contains a comma separated list of arbitrary + expressions. If these expression must contain a comma|leftarrow|rightarrow, + then prepend the comma|leftarrow|rightarrow with a backslash. + + If an expression matches '\\' then it will be replaced + by -th expression. + + Note that all '<..>' forms in a block must have the same number of + comma-separated entries. + + Predefined named template rules: + + + + + +""" + +routine_start_re = re.compile( + r'(\n|\A)(( (\$|\*))|)\s*(subroutine|function)\b', + re.I +) +routine_end_re = re.compile(r'\n\s*end\s*(subroutine|function)\b.*(\n|\Z)', re.I) +function_start_re = re.compile(r'\n (\$|\*)\s*function\b', re.I) + +def parse_structure(astr): + """ Return a list of tuples for each function or subroutine each + tuple is the start and end of a subroutine or function to be + expanded. + """ + + spanlist = [] + ind = 0 + while True: + m = routine_start_re.search(astr, ind) + if m is None: + break + start = m.start() + if function_start_re.match(astr, start, m.end()): + while True: + i = astr.rfind('\n', ind, start) + if i==-1: + break + start = i + if astr[i:i+7]!='\n $': + break + start += 1 + m = routine_end_re.search(astr, m.end()) + ind = end = m and m.end()-1 or len(astr) + spanlist.append((start, end)) + return spanlist + +template_re = re.compile(r"<\s*(\w[\w\d]*)\s*>") +named_re = re.compile(r"<\s*(\w[\w\d]*)\s*=\s*(.*?)\s*>") +list_re = re.compile(r"<\s*((.*?))\s*>") + +def find_repl_patterns(astr): + reps = named_re.findall(astr) + names = {} + for rep in reps: + name = rep[0].strip() or unique_key(names) + repl = rep[1].replace(r'\,', '@comma@') + thelist = conv(repl) + names[name] = thelist + return names + +def find_and_remove_repl_patterns(astr): + names = find_repl_patterns(astr) + astr = re.subn(named_re, '', astr)[0] + return astr, names + +item_re = re.compile(r"\A\\(?P\d+)\Z") +def conv(astr): + b = astr.split(',') + l = [x.strip() for x in b] + for i in 
range(len(l)): + m = item_re.match(l[i]) + if m: + j = int(m.group('index')) + l[i] = l[j] + return ','.join(l) + +def unique_key(adict): + """ Obtain a unique key given a dictionary.""" + allkeys = list(adict.keys()) + done = False + n = 1 + while not done: + newkey = '__l%s' % (n) + if newkey in allkeys: + n += 1 + else: + done = True + return newkey + + +template_name_re = re.compile(r'\A\s*(\w[\w\d]*)\s*\Z') +def expand_sub(substr, names): + substr = substr.replace(r'\>', '@rightarrow@') + substr = substr.replace(r'\<', '@leftarrow@') + lnames = find_repl_patterns(substr) + substr = named_re.sub(r"<\1>", substr) # get rid of definition templates + + def listrepl(mobj): + thelist = conv(mobj.group(1).replace(r'\,', '@comma@')) + if template_name_re.match(thelist): + return "<%s>" % (thelist) + name = None + for key in lnames.keys(): # see if list is already in dictionary + if lnames[key] == thelist: + name = key + if name is None: # this list is not in the dictionary yet + name = unique_key(lnames) + lnames[name] = thelist + return "<%s>" % name + + substr = list_re.sub(listrepl, substr) # convert all lists to named templates + # newnames are constructed as needed + + numsubs = None + base_rule = None + rules = {} + for r in template_re.findall(substr): + if r not in rules: + thelist = lnames.get(r, names.get(r, None)) + if thelist is None: + raise ValueError('No replicates found for <%s>' % (r)) + if r not in names and not thelist.startswith('_'): + names[r] = thelist + rule = [i.replace('@comma@', ',') for i in thelist.split(',')] + num = len(rule) + + if numsubs is None: + numsubs = num + rules[r] = rule + base_rule = r + elif num == numsubs: + rules[r] = rule + else: + print("Mismatch in number of replacements (base <{}={}>) " + "for <{}={}>. Ignoring." 
+ .format(base_rule, ','.join(rules[base_rule]), r, thelist)) + if not rules: + return substr + + def namerepl(mobj): + name = mobj.group(1) + return rules.get(name, (k+1)*[name])[k] + + newstr = '' + for k in range(numsubs): + newstr += template_re.sub(namerepl, substr) + '\n\n' + + newstr = newstr.replace('@rightarrow@', '>') + newstr = newstr.replace('@leftarrow@', '<') + return newstr + +def process_str(allstr): + newstr = allstr + writestr = '' + + struct = parse_structure(newstr) + + oldend = 0 + names = {} + names.update(_special_names) + for sub in struct: + cleanedstr, defs = find_and_remove_repl_patterns(newstr[oldend:sub[0]]) + writestr += cleanedstr + names.update(defs) + writestr += expand_sub(newstr[sub[0]:sub[1]], names) + oldend = sub[1] + writestr += newstr[oldend:] + + return writestr + +include_src_re = re.compile( + r"(\n|\A)\s*include\s*['\"](?P[\w\d./\\]+\.src)['\"]", + re.I +) + +def resolve_includes(source): + d = os.path.dirname(source) + with open(source) as fid: + lines = [] + for line in fid: + m = include_src_re.match(line) + if m: + fn = m.group('name') + if not os.path.isabs(fn): + fn = os.path.join(d, fn) + if os.path.isfile(fn): + lines.extend(resolve_includes(fn)) + else: + lines.append(line) + else: + lines.append(line) + return lines + +def process_file(source): + lines = resolve_includes(source) + return process_str(''.join(lines)) + +_special_names = find_repl_patterns(''' +<_c=s,d,c,z> +<_t=real,double precision,complex,double complex> + + + + + +''') + +# END OF CODE VENDORED FROM `numpy.distutils.from_template` +########################################################### + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("infile", type=str, + help="Path to the input file") + parser.add_argument("-o", "--outdir", type=str, + help="Path to the output directory") + args = parser.parse_args() + + if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')): + raise ValueError(f"Input file has unknown 
extension: {args.infile}") + + outdir_abs = os.path.join(os.getcwd(), args.outdir) + + # Write out the .pyf/.f file + if args.infile.endswith(('.pyf.src', '.f.src')): + code = process_file(args.infile) + fname_pyf = os.path.join(args.outdir, + os.path.splitext(os.path.split(args.infile)[1])[0]) + + with open(fname_pyf, 'w') as f: + f.write(code) + else: + fname_pyf = args.infile + + # Now invoke f2py to generate the C API module file + if args.infile.endswith(('.pyf.src', '.pyf')): + p = subprocess.Popen([sys.executable, '-m', 'numpy.f2py', fname_pyf, + '--build-dir', outdir_abs], #'--quiet'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.getcwd()) + out, err = p.communicate() + if not (p.returncode == 0): + raise RuntimeError(f"Writing {args.outfile} with f2py failed!\n" + f"{out}\n" + r"{err}") + + +if __name__ == "__main__": + main() diff --git a/benchmark/pybench/openblas_wrap/meson.build b/benchmark/pybench/openblas_wrap/meson.build new file mode 100644 index 00000000..9f1b7178 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/meson.build @@ -0,0 +1,50 @@ +# find numpy & f2py includes +inc_numpy = run_command(py3, + ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'], + check : true +).stdout().strip() + +inc_f2py = run_command(py3, + ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'], + check : true +).stdout().strip() + + +inc_np = include_directories(inc_numpy, inc_f2py) +fortranobject_c = inc_f2py / 'fortranobject.c' + + +fortranobject_lib = static_library('_fortranobject', + fortranobject_c, +# c_args: numpy_nodepr_api, + dependencies: py3_dep, + include_directories: [inc_np, inc_f2py], + gnu_symbol_visibility: 'hidden', +) +fortranobject_dep = declare_dependency( + link_with: fortranobject_lib, + include_directories: [inc_np, inc_f2py], +) + + +# f2py generated wrappers + +flapack_module = custom_target('flapack_module', + output: ['_flapackmodule.c'], + input: 
'blas_lapack.pyf.src', + command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'], +) + +py3.extension_module('_flapack', + flapack_module, + link_args: [], # version_link_args, + dependencies: [openblas_dep, fortranobject_dep], + install: true, + subdir: 'openblas_wrap' +) + + +py3.install_sources( + ['__init__.py'], + subdir: 'openblas_wrap' +) diff --git a/benchmark/pybench/scipy_openblas.pc b/benchmark/pybench/scipy_openblas.pc new file mode 100644 index 00000000..2348fac6 --- /dev/null +++ b/benchmark/pybench/scipy_openblas.pc @@ -0,0 +1,12 @@ +libdir=/home/br/repos/OpenBLAS/ +includedir=/home/br/repos/OpenBLAS/ +openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64 +version=0.3.27 +extralib=-lm -lpthread -lgfortran -lquadmath -L${libdir} -lopenblas +Name: openblas +Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version +Version: ${version} +URL: https://github.com/xianyi/OpenBLAS +Libs: -L${libdir} -lopenblas +Libs.private: ${extralib} +Cflags: -I${includedir} diff --git a/c_check b/c_check index 59ab9bb1..c2b52c81 100755 --- a/c_check +++ b/c_check @@ -197,10 +197,22 @@ fi no_lsx=0 no_lasx=0 if [ "$architecture" = "loongarch64" ]; then + lasx_flags='-march=loongarch64' + lsx_flags='-march=loongarch64' + tmpd="$(mktemp -d)" + tmparch="$tmpd/arch.c" + printf "void main(void){ }\n" >> "$tmparch" + args="-march=loongarch64 -o $tmparch.o $tmparch" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + lasx_flags='' + lsx_flags='' + } + tmplsx="$tmpd/lsx.c" codelsx='"vadd.b $vr0, $vr0, $vr0"' - lsx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" args="$lsx_flags -o $tmplsx.o $tmplsx" { @@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then tmplasx="$tmpd/lasx.c" codelasx='"xvadd.b $xr0, $xr0, $xr0"' - lasx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" args="$lasx_flags -o 
$tmplasx.o $tmplasx" { @@ -345,6 +356,9 @@ if [ "$compiler" = "GCC" ]; then no_avx2=0 oldgcc=0 data=`$compiler_name -dumpversion` + case "$data" in *-*) + data="${data%-*}" + esac case "$data" in *.*.*) data="${data%.*}" esac diff --git a/cblas.h b/cblas.h index beaa32cc..097b4303 100644 --- a/cblas.h +++ b/cblas.h @@ -26,6 +26,11 @@ char* openblas_get_config(void); /*Get the CPU corename on runtime.*/ char* openblas_get_corename(void); +/*Set the threading backend to a custom callback.*/ +typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data); +typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data); +void openblas_set_threads_callback_function(openblas_threads_callback callback); + #ifdef OPENBLAS_OS_LINUX /* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); @@ -411,6 +416,18 @@ void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, double *c, OPENBLAS_CONST blasint cldc); +void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, + OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST float ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST float ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); + 
+void cblas_dgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, + OPENBLAS_CONST double * alpha_array, OPENBLAS_CONST double ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST double ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST double * beta_array, double ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); + +void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, + OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); + +void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, + OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); + /*** BFLOAT16 and INT8 extensions ***/ /* convert float array to BFLOAT16 array by rounding */ void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, 
OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); @@ -426,6 +443,9 @@ void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); +void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, + OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/cmake/arch.cmake b/cmake/arch.cmake index eb974456..5f3703ae 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) @@ -57,7 +57,11 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT") endif () - + + if (RISCV64) + set(DYNAMIC_CORE 
RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B) + endif () + if (X86) set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 2da941af..775239e1 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -2,12 +2,18 @@ ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets C related variables. +include(CheckCCompilerFlag) -if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang") +if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM") + set(CCOMMON_OPT "${CCOMMON_OPT} -fp-model=consistent") + set(GCC_VERSION 100) +endif () +if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang") set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") set(COMMON_PROF "${COMMON_PROF} -fno-inline") set(NO_UNINITIALIZED_WARN "-Wno-uninitialized") + set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION}) if (QUIET_MAKE) set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") @@ -36,14 +42,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (LOONGARCH64) if (BINARY64) - CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) if(COMPILER_SUPPORT_LP64D_ABI) set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d") else() set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") endif () else () - CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) if(COMPILER_SUPPORT_ILP32D_ABI) set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d") else() @@ -139,7 +145,6 @@ endif () if (${CORE} STREQUAL COOPERLAKE) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE 
GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") else () @@ -152,7 +157,6 @@ endif () if (${CORE} STREQUAL SAPPHIRERAPIDS) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") else () @@ -166,7 +170,6 @@ if (${CORE} STREQUAL ZEN) if (HAVE_AVX512VL) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 13.0 OR ${GCC_VERSION} VERSION_EQUAL 13.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=znver4") else () @@ -179,7 +182,6 @@ endif () if (${CORE} STREQUAL A64FX) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") else () @@ -193,7 +195,6 @@ if (${CORE} STREQUAL NEOVERSEN2) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () @@ -208,7 +209,6 @@ if (${CORE} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) 
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") else () @@ -220,7 +220,6 @@ endif () if (${CORE} STREQUAL NEOVERSEN1) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") else () @@ -265,23 +264,21 @@ endif () if (${CORE} STREQUAL POWER10) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") else () - message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." ) + message(FATAL_ERROR "Compiler GCC ${GCC_VERSION} does not support Power10." ) endif() endif () endif () if (${CORE} STREQUAL POWER9) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") else () set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") - message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") + message(WARNING "Compiler GCC ${GCC_VERSION} does not fully support Power9.") endif () endif () endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 4d3da1a2..8798ce8b 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,14 +61,17 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () if (LOONGARCH64) if (BINARY64) - CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) if(COMPILER_SUPPORT_LP64D_ABI) set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") else() 
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") endif () + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () else () - CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) if(COMPILER_SUPPORT_ILP32D_ABI) set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") else() @@ -114,12 +117,12 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () endif () -if (${F_COMPILER} STREQUAL "INTEL") +if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () - set(FCOMMON_OPT "${FCOMMON_OPT} -recursive") + set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () diff --git a/cmake/os.cmake b/cmake/os.cmake index e24059dd..2effbe0e 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -38,7 +38,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") # Test for supporting MS_ABI # removed string parsing in favor of CMake's version comparison -hpa - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION}) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) # GCC Version >=4.7 # It is compatible with MSVC ABI. 
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 47e95841..609fbe24 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1218,6 +1218,37 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "A64FX") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t256\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t256\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t8388608\n\n" + "#define L2_LINESIZE\t256\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t0\n\n" + "#define L3_LINESIZE\t0\n\n" + "#define L3_ASSOCIATIVE\t0\n\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "P5600") file(APPEND ${TARGET_CONF_TEMP} "#define L2_SIZE 1048576\n" @@ -1309,6 +1340,15 @@ endif () "#define DTB_DEFAULT_ENTRIES 128\n" "#define DTB_SIZE 4096\n" "#define L2_ASSOCIATIVE 8\n") + elseif ("${TCORE}" STREQUAL "RISCV64_GENERIC") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 32\n" + "#define L2_SIZE 1048576\n" + "#define L2_LINESIZE 32 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 4\n") endif() set(SBGEMM_UNROLL_M 8) set(SBGEMM_UNROLL_N 4) @@ -1342,7 +1382,7 @@ else(NOT CMAKE_CROSSCOMPILING) if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC") #Use generic for MSVC now - message("MSVC") + message(STATUS "MSVC") set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) else() list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) diff 
--git a/cmake/system.cmake b/cmake/system.cmake index c26b415c..683c3181 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -160,11 +160,16 @@ else() endif () endif () +if (C_LAPACK) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + set(CCOMMON_OPT "${CCOMMON_OPT} -Wno-error=incompatible-pointer-types") + endif () +endif () + include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (DEFINED TARGET) if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() @@ -172,15 +177,14 @@ if (DEFINED TARGET) endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake -exhaustive-register-search") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake -mllvm -exhaustive-register-search") else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -exhaustive-register-search") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -mllvm -exhaustive-register-search") endif() endif() endif() if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") else() @@ -188,22 +192,21 @@ if (DEFINED TARGET) endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids -exhaustive-register-search") + set 
(KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids -mllvm -exhaustive-register-search") else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -exhaustive-register-search") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -mllvm -exhaustive-register-search") endif() endif() endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -exhaustive-register-search") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mllvm -exhaustive-register-search") endif() endif() if (((${TARGET} STREQUAL ZEN) AND HAVE_AVX512VL) AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.99) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4") else() @@ -215,14 +218,13 @@ if (DEFINED TARGET) else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -exhaustive-register-search") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mllvm -exhaustive-register-search") endif() endif() if ((${TARGET} STREQUAL HASWELL OR (${TARGET} STREQUAL ZEN AND NOT HAVE_AVX512VL)) AND NOT NO_AVX2) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 4.7 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 4.7) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") @@ -261,20 +263,18 @@ if (DEFINED TARGET) endif() if (${TARGET} STREQUAL POWER10) - execute_process(COMMAND 
${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + if (CMAKE_C_COMPILER VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") else () - message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") + message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.") endif() endif() if (${TARGET} STREQUAL POWER9) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 5.0 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 5.0) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") else () set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") - message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") + message(WARNING "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support fully Power9.") endif() endif() if (${TARGET} STREQUAL POWER8) @@ -285,11 +285,10 @@ if (${TARGET} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") else () - message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") + message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} 
${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.") endif() endif() endif() @@ -297,11 +296,10 @@ if (${TARGET} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.") + message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse N2.") endif() endif() endif() @@ -312,6 +310,18 @@ if (${TARGET} STREQUAL NEOVERSEV1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") endif() endif() + if (${TARGET} STREQUAL A64FX) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") + else () + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve -mtune=a64fx") + else () + message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support A64FX.") + endif() + endif() + endif() endif() @@ -381,12 +391,19 @@ endif () if (X86_64 OR ${CORE} STREQUAL POWER10) set(SMALL_MATRIX_OPT TRUE) endif () +if (ARM64) + set(GEMM_GEMV_FORWARD TRUE) +endif () + +if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD") +endif () if (SMALL_MATRIX_OPT) set(CCOMMON_OPT "${CCOMMON_OPT} 
-DSMALL_MATRIX_OPT") endif () if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR POWER) + if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (DYNAMIC_OLDER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") @@ -604,7 +621,10 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}") #For LAPACK Fortran codes. -set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}") +set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" ) +if (LAPACK_STRLEN) + set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}") +endif() set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") #Disable -fopenmp for LAPACK Fortran codes on Windows. @@ -617,7 +637,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif () if (CMAKE_Fortran_COMPILER) -if (${F_COMPILER} STREQUAL "NAG" OR ${F_COMPILER} STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") +if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") message(STATUS "removing fortran flags") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 5e8ba866..9befc9a3 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -187,8 +187,8 @@ macro(ParseMakefileVars MAKEFILE_IN) set (HasValidGroup 1) set (STR ${CMAKE_MATCH_4}) endif () - if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) - if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) + if (DEFINED CMAKE_MATCH_1 AND ${HasValidGroup} EQUAL 1) + if (NOT (CMAKE_MATCH_1 STREQUAL ${STR})) #message (STATUS "condition is true") set (IfElse 1) continue () diff --git a/common_arm64.h b/common_arm64.h index 6ae6a35a..d80b9e43 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -55,6 +55,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. #ifndef ASSEMBLER +static __inline int WhereAmI(void){ + uint64_t ret; + __asm__ volatile ( + " mrs x0, mpidr_el1 \n" + " and x0, x0, 0xff \n" + :"=r" (ret) + :: "memory" + ); + ret +=1; + if ((int)ret <0) ret = 0; + return (int)ret; +} + static __inline void blas_lock(volatile BLASULONG *address){ BLASULONG ret; diff --git a/common_interface.h b/common_interface.h index 5a2e1654..efd3c664 100644 --- a/common_interface.h +++ b/common_interface.h @@ -47,6 +47,11 @@ int BLASFUNC(xerbla)(char *, blasint *info, blasint); void openblas_set_num_threads_(int *); +/*Set the threading backend to a custom callback.*/ +typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data); +typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data); +extern openblas_threads_callback openblas_threads_callback_; + FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/common_level3.h b/common_level3.h index 5080ada1..d370c1f9 100644 --- a/common_level3.h +++ b/common_level3.h @@ -1937,8 +1937,13 @@ int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG); -int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); +int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); +int sgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); +int dgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); +int cgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); +int 
zgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); +int sbgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); #ifdef __CUDACC__ } diff --git a/common_loongarch64.h b/common_loongarch64.h index b1426da7..367e5df1 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -96,6 +96,32 @@ static inline int WhereAmI(void){ } #endif +static inline int get_cpu_model(char *model_name) { + FILE *cpuinfo_file = fopen("/proc/cpuinfo", "r"); + if (!cpuinfo_file) { + return 0; + } + char line[1024]; + while (fgets(line, sizeof(line), cpuinfo_file)) { + if (strstr(line, "model name")) { + char *token = strtok(line, ":"); + token = strtok(NULL, ":"); + while (*token == ' ') + token++; + char *end = token + strlen(token) - 1; + while (end > token && (*end == '\n' || *end == '\r')) { + *end = '\0'; + end--; + } + strcpy(model_name, token); + fclose(cpuinfo_file); + return 1; + } + } + fclose(cpuinfo_file); + return 0; +} + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") #else diff --git a/common_macro.h b/common_macro.h index 3226d0f1..a924651d 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2655,9 +2655,20 @@ typedef struct { BLASLONG prea, preb, prec, pred; #endif + + //for gemm_batch + void * routine; + int routine_mode; + } blas_arg_t; #endif +#ifdef SMALL_MATRIX_OPT +#define BLAS_SMALL_OPT 0x10000U +#define BLAS_SMALL_B0_OPT 0x30000U +#endif + + #ifdef XDOUBLE #define TRSV_NUU qtrsv_NUU diff --git a/common_power.h b/common_power.h index 3fe776f2..6b13f06b 100644 --- a/common_power.h +++ b/common_power.h @@ -841,17 +841,17 @@ Lmcount$lazy_ptr: #endif #if defined(PPC440) -#define BUFFER_SIZE ( 2 << 20) +#define BUFFER_SIZE ( 2UL << 20) #elif defined(PPC440FP2) -#define BUFFER_SIZE ( 16 << 20) +#define BUFFER_SIZE ( 16UL << 20) #elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10) -#define BUFFER_SIZE ( 64 << 22) +#define BUFFER_SIZE ( 64UL << 22) #else -#define BUFFER_SIZE ( 16 << 20) 
+#define BUFFER_SIZE ( 16UL << 20) #endif #ifdef DYNAMIC_ARCH #undef BUFFER_SIZE -#define BUFFER_SIZE (64 << 22) +#define BUFFER_SIZE (64UL << 22) #endif #ifndef PAGESIZE diff --git a/common_thread.h b/common_thread.h index d37fcb18..4a8db682 100644 --- a/common_thread.h +++ b/common_thread.h @@ -111,8 +111,8 @@ typedef struct blas_queue { struct blas_queue *next; #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) - // CRITICAL_SECTION lock; - // HANDLE finish; + CRITICAL_SECTION lock; + HANDLE finish; volatile int finished; #else pthread_mutex_t lock; diff --git a/common_x86_64.h b/common_x86_64.h index dda168d6..21cd198f 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -253,7 +253,7 @@ static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ #ifndef BUFFERSIZE #define BUFFER_SIZE (32 << 22) #else -#define BUFFER_SIZE (32 << BUFFERSIZE) +#define BUFFER_SIZE (32UL << BUFFERSIZE) #endif #define SEEK_ADDRESS diff --git a/cpuid_arm64.c b/cpuid_arm64.c index b23edc4e..2cfa96ea 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -46,6 +46,7 @@ size_t length64=sizeof(value64); #define CPU_NEOVERSEN1 11 #define CPU_NEOVERSEV1 16 #define CPU_NEOVERSEN2 17 +#define CPU_NEOVERSEV2 24 #define CPU_CORTEXX1 18 #define CPU_CORTEXX2 19 #define CPU_CORTEXA510 20 @@ -91,7 +92,8 @@ static char *cpuname[] = { "CORTEXA510", "CORTEXA710", "FT2000", - "CORTEXA76" + "CORTEXA76", + "NEOVERSEV2" }; static char *cpuname_lower[] = { @@ -118,7 +120,8 @@ static char *cpuname_lower[] = { "cortexa510", "cortexa710", "ft2000", - "cortexa76" + "cortexa76", + "neoversev2" }; int get_feature(char *search) @@ -213,6 +216,8 @@ int detect(void) return CPU_CORTEXX2; else if (strstr(cpu_part, "0xd4e")) //X3 return CPU_CORTEXX2; + else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al. 
+ return CPU_NEOVERSEV2; else if (strstr(cpu_part, "0xd0b")) return CPU_CORTEXA76; } @@ -425,6 +430,23 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_NEOVERSEV2: + printf("#define ARMV9\n"); + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + // L1 Data TLB = 48 entries + // L2 Data TLB = 2048 entries + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs. + break; case CPU_CORTEXA510: case CPU_CORTEXA710: case CPU_CORTEXX1: diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index 0ad32ae4..3b7a9c82 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include /* If LASX extension instructions supported, * using core LOONGSON3R5 diff --git a/cpuid_x86.c b/cpuid_x86.c index 6cf4d650..f77cca1d 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1529,12 +1529,14 @@ int get_cpuname(void){ switch (model) { case 5: // Comet Lake H and S case 6: // Comet Lake U + case 10: // Meteor Lake if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 0: // Meteor Lake case 7: // Rocket Lake if(support_avx512()) return CPUTYPE_SKYLAKEX; @@ -1560,6 +1562,19 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 12: //family 6 exmodel 12 + switch (model) { + case 15: + if(support_avx512()) + return CPUTYPE_SAPPHIRERAPIDS; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -2377,10 +2392,10 @@ int get_coretype(void){ else return CORE_NEHALEM; } - case 15: - if (model <= 0x2) return CORE_NORTHWOOD; - else return CORE_PRESCOTT; } + case 15: + if (model <= 0x2) return CORE_NORTHWOOD; + else return CORE_PRESCOTT; } } @@ -2511,6 +2526,7 @@ int get_coretype(void){ case 0x7: switch (exmodel) { case 5: + case 6: if (support_avx2()) return CORE_ZEN; else diff --git a/ctest/Makefile b/ctest/Makefile index bbaf96f8..c02e04e1 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -25,6 +25,9 @@ endif override CFLAGS += -DADD$(BU) -DCBLAS ifeq ($(F_COMPILER),GFORTRAN) +ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) + override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 +endif override FFLAGS += -fno-tree-vectorize endif override TARGET_ARCH= @@ -203,7 +206,6 @@ ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 endif endif -endif ifeq ($(SUPPORT_GEMM3M),1) ifeq ($(USE_OPENMP), 1) @@ -222,7 +224,7 @@ ifeq ($(BUILD_COMPLEX16),1) endif endif endif - +endif diff --git a/debian/README.Debian b/debian/README.Debian index 
0c255d63..03033e9a 100644 --- a/debian/README.Debian +++ b/debian/README.Debian @@ -20,10 +20,10 @@ More information is available at: Building an optimized OpenBLAS package for your machine ======================================================= -On amd64, arm64, i386, loong64, ppc64el and s390x, libopenblas provides a -multiple architecture library. All kernels are included in the library and the -one matching best your processor is selected at run time. Recompiling locally -should bring minimal performance improvement. +On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, libopenblas +provides a multiple architecture library. All kernels are included in the +library and the one matching best your processor is selected at run time. +Recompiling locally should bring minimal performance improvement. On the contrary, on other architectures, the package is compiled with minimal optimizations, so that it can run on all hardware. You may want to recompile it @@ -69,5 +69,5 @@ using: $ aptitude versions libopenblas0 - -- Sébastien Villemot , Sat, 9 Sep 2023 17:15:50 +0200 + -- Sébastien Villemot , Sun, 11 Aug 2024 10:34:19 +0200 -- Sylvestre Ledru Tue, 31 May 2011 13:56:22 +0200 diff --git a/debian/changelog b/debian/changelog index f67e4cfc..c7866a74 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,10 +1,25 @@ -openblas (0.3.27+ds-2+rpi1) trixie-staging; urgency=medium +openblas (0.3.28+ds-2+rpi1) trixie-staging; urgency=medium [changes brought forward from 0.2.15-1+rpi1 by Peter Michael Green at Fri, 11 Mar 2016 23:39:32 +0000] * Disable arm-gcc-flags.patch which was enforcing gcc flags that are wrong for raspbian. 
- -- Raspbian forward porter Fri, 31 May 2024 04:39:55 +0000 + -- Raspbian forward porter Tue, 20 Aug 2024 23:47:01 +0000 + +openblas (0.3.28+ds-2) unstable; urgency=medium + + * gcc14-mips64el.patch: new patch, fixes FTBFS on mips64el + + -- Sébastien Villemot Tue, 13 Aug 2024 22:05:18 +0200 + +openblas (0.3.28+ds-1) unstable; urgency=medium + + * New upstream version 0.3.28+ds + * d/copyright: reflect upstream changes + * testsuite-64bit-big-endian.patch: drop patch, applied upstream + * Enable dynamic arch support on riscv64 + + -- Sébastien Villemot Sun, 11 Aug 2024 10:37:39 +0200 openblas (0.3.27+ds-2) unstable; urgency=medium diff --git a/debian/control b/debian/control index 6b9a1130..90a332c0 100644 --- a/debian/control +++ b/debian/control @@ -30,8 +30,9 @@ Breaks: libblas3 (<< 3.7.1-2~), Description: Optimized BLAS (linear algebra) library (meta) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -52,8 +53,9 @@ Provides: libblas.so.3, Description: Optimized BLAS (linear algebra) library (shared lib, pthread) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . 
On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -76,8 +78,9 @@ Provides: libblas.so.3, Description: Optimized BLAS (linear algebra) library (shared lib, openmp) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -100,8 +103,9 @@ Provides: libblas.so.3, Description: Optimized BLAS (linear algebra) library (shared lib, serial) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -122,8 +126,9 @@ Breaks: libblas-dev (<< 3.7.1-2~), Description: Optimized BLAS (linear algebra) library (dev, meta) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. 
+ On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -146,8 +151,9 @@ Provides: libblas.so, Description: Optimized BLAS (linear algebra) library (dev, pthread) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -172,8 +178,9 @@ Provides: libblas.so, Description: Optimized BLAS (linear algebra) library (dev, openmp) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -198,8 +205,9 @@ Provides: libblas.so, Description: Optimized BLAS (linear algebra) library (dev, serial) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . 
- On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -221,8 +229,9 @@ Depends: ${misc:Depends}, Description: Optimized BLAS (linear algebra) library (shared lib, 64bit, meta) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -241,8 +250,9 @@ Provides: libblas64.so.3, Description: Optimized BLAS (linear algebra) library (shared lib, 64bit, pthread) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . 
On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -261,8 +271,9 @@ Provides: libblas64.so.3, Description: Optimized BLAS (linear algebra) library (shared lib, 64bit, openmp) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -281,8 +292,9 @@ Provides: libblas64.so.3, Description: Optimized BLAS (linear algebra) library (shared lib, 64bit, serial) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -300,8 +312,9 @@ Depends: libopenblas64-0 (= ${binary:Version}), Description: Optimized BLAS (linear algebra) library (dev, 64bit, meta) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. 
+ On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -323,8 +336,9 @@ Provides: libblas64.so, Description: Optimized BLAS (linear algebra) library (dev, 64bit, pthread) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -346,8 +360,9 @@ Provides: libblas64.so, Description: Optimized BLAS (linear algebra) library (dev, 64bit, openmp) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . - On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for @@ -369,8 +384,9 @@ Provides: libblas64.so, Description: Optimized BLAS (linear algebra) library (dev, 64bit, serial) OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. . 
- On amd64, arm64, i386, loong64, ppc64el and s390x, all kernels are included in - the library and the one matching best your processor is selected at runtime. + On amd64, arm64, i386, loong64, ppc64el, riscv64 and s390x, all kernels are + included in the library and the one matching best your processor is selected + at runtime. . On other architectures, for maximum performance, you may want to rebuild OpenBLAS locally, see the section: “Building an optimized OpenBLAS package for diff --git a/debian/copyright b/debian/copyright index 85a3acb6..90ac75d9 100644 --- a/debian/copyright +++ b/debian/copyright @@ -311,6 +311,7 @@ Files: benchmark/cholesky.c kernel/generic/laswp_ncopy_1.c kernel/generic/laswp_ncopy_2.c kernel/generic/laswp_ncopy_4.c + kernel/generic/laswp_ncopy_6.c kernel/generic/laswp_ncopy_8.c kernel/generic/lsame.c kernel/generic/neg_tcopy_1.c @@ -475,6 +476,11 @@ Files: benchmark/cholesky.c kernel/generic/ztrsm_utcopy_2.c kernel/generic/ztrsm_utcopy_4.c kernel/generic/ztrsm_utcopy_8.c + kernel/loongarch64/gemm_ncopy_6.prefx.c + kernel/loongarch64/trsm_kernel_LN_UNROLLN6.c + kernel/loongarch64/trsm_kernel_LT_UNROLLN6.c + kernel/loongarch64/trsm_kernel_RN_UNROLLN6.c + kernel/loongarch64/trsm_kernel_RT_UNROLLN6.c kernel/power/lock.c kernel/power/dgemm_ncopy_8_power10.c kernel/power/sbgemm_ncopy_16_power10.c @@ -560,7 +566,7 @@ Files: benchmark/cholesky.c lapack/trtrs/ztrtrs_parallel.c lapack/trtrs/ztrtrs_single.c symcopy.h -Copyright: 2009-2010 The University of Texas at Austin +Copyright: 2009-2010, 2024 The University of Texas at Austin 2023-2024 The OpenBLAS Project License: BSD-2-clause-texas 1. 
Redistributions of source code must retain the above diff --git a/debian/patches/gcc14-mips64el.patch b/debian/patches/gcc14-mips64el.patch new file mode 100644 index 00000000..6f4db10c --- /dev/null +++ b/debian/patches/gcc14-mips64el.patch @@ -0,0 +1,33 @@ +Description: Fix FTBFS on mips64el with GCC 14 +Origin: upstream, https://github.com/OpenMathLib/OpenBLAS/pull/4864 +Bug: https://github.com/OpenMathLib/OpenBLAS/issues/4862 +Reviewed-by: Sébastien Villemot +Last-Update: 2024-08-13 +--- +This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ +diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c +index 7d6bcd776..6a2e4d430 100644 +--- a/lapack/potrf/potrf_L_parallel.c ++++ b/lapack/potrf/potrf_L_parallel.c +@@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, + HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); + #else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, +- &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); ++ &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LN, sa, sb, args -> nthreads); + #endif + } + } +diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c +index 1f1427276..de7d33374 100644 +--- a/lapack/potrf/potrf_U_parallel.c ++++ b/lapack/potrf/potrf_U_parallel.c +@@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, + HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); + #else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, +- &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); ++ &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UC, sa, sb, args -> nthreads); + #endif + } + } diff --git a/debian/patches/no-embedded-lapack.patch b/debian/patches/no-embedded-lapack.patch index 7a20e457..a8d29c00 100644 --- 
a/debian/patches/no-embedded-lapack.patch +++ b/debian/patches/no-embedded-lapack.patch @@ -8,7 +8,7 @@ Last-Update: 2017-07-27 This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ --- a/Makefile +++ b/Makefile -@@ -282,95 +282,20 @@ hpl_p : +@@ -286,95 +286,20 @@ hpl_p : fi; \ done @@ -114,7 +114,7 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ large.tgz : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@@ -438,10 +363,5 @@ ifeq ($(OSNAME), Darwin) +@@ -442,10 +367,5 @@ ifeq ($(OSNAME), Darwin) endif @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @rm -f cblas.tmp cblas.tmp2 @@ -138,10 +138,11 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ # - CC is an implicit variable so neither '?=' or 'ifndef' can be used. --- a/interface/Makefile +++ b/interface/Makefile -@@ -2419,3 +2419,9 @@ cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PS - cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) +@@ -2433,3 +2433,10 @@ cblas_cgemm_batch.$(SUFFIX) cblas_cgemm_ + cblas_zgemm_batch.$(SUFFIX) cblas_zgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) ++ +# The list of symbols to be removed can be seen in the diff between LAPACK's +# original SRC/Makefile and the version of that same file that is included in +# OpenBLAS (unfiltered) tarball diff --git a/debian/patches/remove-openmp-warning.patch b/debian/patches/remove-openmp-warning.patch index 016f2d02..104a70cc 100644 --- a/debian/patches/remove-openmp-warning.patch +++ b/debian/patches/remove-openmp-warning.patch @@ -8,9 +8,9 @@ Last-Update: 2014-02-17 This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c -@@ -893,16 +893,6 @@ int exec_blas(BLASLONG num, blas_queue_t - fprintf(STDERR, "Exec_blas is called. 
Number of executing threads : %ld\n", num); - #endif +@@ -811,16 +811,6 @@ if (openblas_threads_callback_) { + } + -#ifdef __ELF__ - if (omp_in_parallel && (num > 1)) { diff --git a/debian/patches/series b/debian/patches/series index 251a6e24..6c8d071b 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -3,5 +3,5 @@ no-embedded-lapack.patch shared-blas-lapack.patch matgen-symbols-not-included.patch combssq-deprecate.patch -testsuite-64bit-big-endian.patch +gcc14-mips64el.patch auto-0.3.7+ds-1+rpi1-e5cdb5bc2fdbe4ed5849a311d639ae1e43ddffb5-1566623053 diff --git a/debian/patches/shared-blas-lapack.patch b/debian/patches/shared-blas-lapack.patch index dbdf6ca3..4366ce53 100644 --- a/debian/patches/shared-blas-lapack.patch +++ b/debian/patches/shared-blas-lapack.patch @@ -21,7 +21,7 @@ Last-Update: 2020-07-31 This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ --- a/interface/Makefile +++ b/interface/Makefile -@@ -2425,3 +2425,28 @@ cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PS +@@ -2440,3 +2440,28 @@ cblas_zgemm_batch.$(SUFFIX) cblas_zgemm_ delete-duplicate-lapack-objects: if test -d ../lapack-netlib; then cd ../lapack-netlib \ && rm $(SLAPACKOBJS) $(DLAPACKOBJS) $(CLAPACKOBJS) $(ZLAPACKOBJS) lsame.o xerbla.o; fi diff --git a/debian/patches/testsuite-64bit-big-endian.patch b/debian/patches/testsuite-64bit-big-endian.patch deleted file mode 100644 index c8db7a27..00000000 --- a/debian/patches/testsuite-64bit-big-endian.patch +++ /dev/null @@ -1,26 +0,0 @@ -Description: Fix FTBFS on big-endian 64-bit architectures - A long* pointer was incorrectly cast as an int*. 
-Origin: upstream, https://github.com/OpenMathLib/OpenBLAS/commit/b1d722fc0cf563298de8c5ae66fc04a1901d4bf1 -Bug: https://github.com/OpenMathLib/OpenBLAS/issues/4633 -Reviewed-by: Sébastien Villemot -Last-Update: 2024-04-29 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ ---- a/utest/test_extensions/xerbla.c -+++ b/utest/test_extensions/xerbla.c -@@ -39,7 +39,7 @@ static char *rout; - - static void F77_xerbla(char *srname, void *vinfo) - { -- int info=*(int*)vinfo; -+ blasint info=*(blasint*)vinfo; - - if (link_xerbla) - { -@@ -85,4 +85,4 @@ void set_xerbla(char* current_rout, int - lerr = TRUE; - _info = expected_info; - rout = current_rout; --} -\ No newline at end of file -+} diff --git a/debian/rules b/debian/rules index 1f6063ef..fd6a00cd 100755 --- a/debian/rules +++ b/debian/rules @@ -28,7 +28,7 @@ GENERIC_OPTIONS := NUM_THREADS=128 # Architectures with dynamic arch selection # TARGET=GENERIC is needed to avoid FTBFS when CPU detection fails (see #923607) -ENABLE_DYNAMIC_ARCHS := amd64 arm64 i386 ppc64el s390x loong64 +ENABLE_DYNAMIC_ARCHS := amd64 arm64 i386 ppc64el s390x loong64 riscv64 ifneq (,$(filter $(DEB_HOST_ARCH),$(ENABLE_DYNAMIC_ARCHS))) GENERIC_OPTIONS += DYNAMIC_ARCH=1 DYNAMIC_OLDER=1 TARGET=GENERIC endif @@ -54,7 +54,6 @@ TARGET_OPTION_ppc64 = TARGET=POWER4 # We do not use DYNAMIC_ARCH selection on mips64el because it only works for # Loongson3R3/3R4 TARGET_OPTION_mips64el = TARGET=MIPS64_GENERIC -TARGET_OPTION_riscv64 = TARGET=RISCV64_GENERIC TARGET_OPTION_sparc64 = TARGET=SPARC GENERIC_OPTIONS += $(TARGET_OPTION_$(DEB_HOST_ARCH)) diff --git a/docs/about.md b/docs/about.md new file mode 100644 index 00000000..dea1adab --- /dev/null +++ b/docs/about.md @@ -0,0 +1,53 @@ +## Mailing list + +We have a [GitHub discussions](https://github.com/OpenMathLib/OpenBLAS/discussions/) forum to discuss usage and development of OpenBLAS. 
We also have a [Google group for *users*](https://groups.google.com/forum/#!forum/openblas-users) and a [Google group for *development of*](https://groups.google.com/forum/#!forum/openblas-dev) OpenBLAS. + +## Acknowledgements + +This work was or is partially supported by the following grants, contracts and institutions: + +* Research and Development of Compiler System and Toolchain for Domestic CPU, National S&T Major Projects: Core Electronic Devices, High-end General Chips and Fundamental Software (No.2009ZX01036-001-002) +* National High-tech R&D Program of China (Grant No.2012AA010903) +* [PerfXLab](http://www.perfxlab.com/) +* Chan Zuckerberg Initiative's Essential Open Source Software for Science program: + * Cycle 1 grant: [Strengthening NumPy's foundations - growing beyond code](https://figshare.com/articles/journal_contribution/Proposal_NumPy_OpenBLAS_for_Chan_Zuckerberg_Initiative_EOSS_2019_round_1/10302167) (2019-2020) + * Cycle 3 grant: [Improving usability and sustainability for NumPy and OpenBLAS](https://chanzuckerberg.com/eoss/proposals/improving-usability-and-sustainability-for-numpy-and-openblas/) (2020-2021) +* Sovereign Tech Fund funding: [Keeping high performance linear algebra computation accessible and open for all](https://www.sovereigntechfund.de/tech/openblas) (2023-2024) + +Over the course of OpenBLAS development, a number of donations were received. +You can read OpenBLAS's statement of receipts and disbursement and cash balance in +[this Google doc](https://docs.google.com/spreadsheet/ccc?key=0AghkTjXe2lDndE1UZml0dGpaUzJmZGhvenBZd1F2R1E&usp=sharing) (covers 2013-2016). +A list of backers is available [in BACKERS.md](https://github.com/OpenMathLib/OpenBLAS/blob/develop/BACKERS.md) in the main repo. + +### Donations + +We welcome hardware donations, including the latest CPUs and motherboards. 
+ + +## Open source users of OpenBLAS + +Prominent open source users of OpenBLAS include: + +* [Julia](https://julialang.org) - a high-level, high-performance dynamic programming language for technical computing +* [NumPy](https://numpy.org) - the fundamental package for scientific computing with Python +* [SciPy](https://scipy.org) - fundamental algorithms for scientific computing in Python +* [R](https://www.r-project.org/) - a free software environment for statistical computing and graphics +* [OpenCV](https://opencv.org/) - the world's biggest computer vision library + +OpenBLAS is packaged in most major Linux distros, as well as general and +numerical computing-focused packaging ecosystems like Nix, Homebrew, Spack and +conda-forge. + +OpenBLAS is used directly by libraries written in C, C++ and Fortran (and +probably other languages), and directly by end users in those languages. + + +## Publications + +### 2013 + +* Wang Qian, Zhang Xianyi, Zhang Yunquan, Qing Yi, **AUGEM: Automatically Generate High Performance Dense Linear Algebra Kernels on x86 CPUs**, In the International Conference for High Performance Computing, Networking, Storage and Analysis (SC'13), Denver CO, November 2013. [[pdf](http://xianyi.github.io/paper/augem_SC13.pdf)] + +### 2012 + +* Zhang Xianyi, Wang Qian, Zhang Yunquan, **Model-driven Level 3 BLAS Performance Optimization on Loongson 3A Processor**, 2012 IEEE 18th International Conference on Parallel and Distributed Systems (ICPADS), 17-19 Dec. 2012. diff --git a/docs/build_system.md b/docs/build_system.md new file mode 100644 index 00000000..3de22058 --- /dev/null +++ b/docs/build_system.md @@ -0,0 +1,120 @@ +This page describes the Make-based build, which is the default/authoritative +build method. Note that the OpenBLAS repository also supports building with +CMake (not described here) - that generally works and is tested, however there +may be small differences between the Make and CMake builds. + +!!! 
warning + This page is made by someone who is not the developer and should not be considered as an official documentation of the build system. For getting the full picture, it is best to read the Makefiles and understand them yourself. + +## Makefile dep graph + +``` +Makefile +| +|----- Makefile.system # !!! this is included by many of the Makefiles in the subdirectories !!! +| | +| |===== Makefile.prebuild # This is triggered (not included) once by Makefile.system +| | | # and runs before any of the actual library code is built. +| | | # (builds and runs the "getarch" tool for cpu identification, +| | | # runs the compiler detection scripts c_check and f_check) +| | | +| | ----- (Makefile.conf) [ either this or Makefile_kernel.conf is generated ] +| | | { Makefile.system#L243 } +| | ----- (Makefile_kernel.conf) [ temporary Makefile.conf during DYNAMIC_ARCH builds ] +| | +| |----- Makefile.rule # defaults for build options that can be given on the make command line +| | +| |----- Makefile.$(ARCH) # architecture-specific compiler options and OpenBLAS buffer size values +| +|~~~~~ exports/ +| +|~~~~~ test/ +| +|~~~~~ utest/ +| +|~~~~~ ctest/ +| +|~~~~~ cpp_thread_test/ +| +|~~~~~ kernel/ +| +|~~~~~ ${SUBDIRS} +| +|~~~~~ ${BLASDIRS} +| +|~~~~~ ${NETLIB_LAPACK_DIR}{,/timing,/testing/{EIG,LIN}} +| +|~~~~~ relapack/ +``` + +## Important Variables + +Most of the tunable variables are found in [Makefile.rule](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.rule), along with their detailed descriptions.
+Most of the variables are detected automatically in [Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild), if they are not set in the environment. + +### CPU related +``` +ARCH - Target architecture (eg. x86_64) +TARGET - Target CPU architecture, in case of DYNAMIC_ARCH=1 means library will not be usable on less capable CPUs +TARGET_CORE - TARGET_CORE will override TARGET internally during each cpu-specific cycle of the build for DYNAMIC_ARCH +DYNAMIC_ARCH - For building library for multiple TARGETs (does not lose any optimizations, but increases library size) +DYNAMIC_LIST - optional user-provided subset of the DYNAMIC_CORE list in Makefile.system +``` + +### Toolchain related +``` +CC - TARGET C compiler used for compilation (can be cross-toolchains) +FC - TARGET Fortran compiler used for compilation (can be cross-toolchains, set NOFORTRAN=1 if used cross-toolchain has no fortran compiler) +AR, AS, LD, RANLIB - TARGET toolchain helpers used for compilation (can be cross-toolchains) + +HOSTCC - compiler of build machine, needed to create proper config files for target architecture +HOST_CFLAGS - flags for build machine compiler +``` + +### Library related +``` +BINARY - 32/64 bit library + +BUILD_SHARED - Create shared library +BUILD_STATIC - Create static library + +QUAD_PRECISION - enable support for IEEE quad precision [ largely unimplemented leftover from GotoBLAS, do not use ] +EXPRECISION - Obsolete option to use float80 of SSE on BSD-like systems +INTERFACE64 - Build with 64bit integer representations to support large array index values [ incompatible with standard API ] + +BUILD_SINGLE - build the single-precision real functions of BLAS [and optionally LAPACK] +BUILD_DOUBLE - build the double-precision real functions +BUILD_COMPLEX - build the single-precision complex functions +BUILD_COMPLEX16 - build the double-precision complex functions +(all four types are included in the build by default when none was specifically 
selected) + +BUILD_BFLOAT16 - build the "half precision brainfloat" real functions + +USE_THREAD - Use a multithreading backend (default to pthread) +USE_LOCKING - implement locking for thread safety even when USE_THREAD is not set (so that the singlethreaded library can + safely be called from multithreaded programs) +USE_OPENMP - Use OpenMP as multithreading backend +NUM_THREADS - define this to the maximum number of parallel threads you expect to need (defaults to the number of cores in the build cpu) +NUM_PARALLEL - define this to the number of OpenMP instances that your code may use for parallel calls into OpenBLAS (default 1,see below) + +``` + + +OpenBLAS uses a fixed set of memory buffers internally, used for communicating +and compiling partial results from individual threads. For efficiency, the +management array structure for these buffers is sized at build time - this +makes it necessary to know in advance how many threads need to be supported on +the target system(s). + +With OpenMP, there is an additional level of complexity as there may be calls +originating from a parallel region in the calling program. If OpenBLAS gets +called from a single parallel region, it runs single-threaded automatically to +avoid overloading the system by fanning out its own set of threads. In the case +that an OpenMP program makes multiple calls from independent regions or +instances in parallel, this default serialization is not sufficient as the +additional caller(s) would compete for the original set of buffers already in +use by the first call. So if multiple OpenMP runtimes call into OpenBLAS at the +same time, then only one of them will be able to make progress while all the +rest of them spin-wait for the one available buffer. Setting `NUM_PARALLEL` to +the upper bound on the number of OpenMP runtimes that you can have in a process +ensures that there are a sufficient number of buffer sets available. 
diff --git a/docs/ci.md b/docs/ci.md new file mode 100644 index 00000000..6d6112cb --- /dev/null +++ b/docs/ci.md @@ -0,0 +1,56 @@ +# CI jobs + +| Arch|Target CPU|OS|Build system|XComp to|C Compiler|Fortran Compiler|threading|DYN_ARCH|INT64|Libraries| CI Provider| CPU count| +| ------------|---|---|-----------|-------------|----------|----------------|------|------------|----------|-----------|----------|-------| +| x86_64 |Intel 32bit|Windows|CMAKE/VS2015| -|mingw6.3| - | pthreads | - | - | static | Appveyor| | +| x86_64 |Intel |Windows|CMAKE/VS2015| -|mingw5.3| - | pthreads | - | - | static | Appveyor| | +| x86_64 |Intel |Centos5|gmake | -|gcc 4.8 |gfortran| pthreads | + | - | both | Azure | | +| x86_64 |SDE (SkylakeX)|Ubuntu| CMAKE| - | gcc | gfortran | pthreads | - | - | both | Azure | | +| x86_64 |Haswell/ SkylakeX|Windows|CMAKE/VS2017| - | VS2017| - | | - | - | static | Azure | | +| x86_64 | " | Windows|mingw32-make| - |gcc | gfortran | | list | - | both | Azure | | +| x86_64 | " |Windows|CMAKE/Ninja| - |LLVM | - | | - | - | static | Azure | | +| x86_64 | " |Windows|CMAKE/Ninja| - |LLVM | flang | | - | - | static | Azure | | +| x86_64 | " |Windows|CMAKE/Ninja| - |VS2022| flang* | | - | - | static | Azure | | +| x86_64 | " |macOS11|gmake | - | gcc-10|gfortran| OpenMP | + | - | both | Azure | | +| x86_64 | " |macOS11|gmake | - | gcc-10|gfortran| none | - | - | both | Azure | | +| x86_64 | " |macOS12|gmake | - | gcc-12|gfortran|pthreads| - | - | both | Azure | | +| x86_64 | " |macOS11|gmake | - | llvm | - | OpenMP | + | - | both | Azure | | +| x86_64 | " |macOS11|CMAKE | - | llvm | - | OpenMP | no_avx512 | - | static | Azure | | +| x86_64 | " |macOS11|CMAKE | - | gcc-10| gfortran| pthreads | list | - | shared | Azure | | +| x86_64 | " |macOS11|gmake | - | llvm | ifort | pthreads | - | - | both | Azure | | +| x86_64 | " |macOS11|gmake |arm| AndroidNDK-llvm | - | | - | - | both | Azure | | +| x86_64 | " |macOS11|gmake |arm64| XCode 12.4 | - | | + | - | both | 
Azure | | +| x86_64 | " |macOS11|gmake |arm | XCode 12.4 | - | | + | - | both | Azure | | +| x86_64 | " |Alpine Linux(musl)|gmake| - | gcc | gfortran | pthreads | + | - | both | Azure | | +| arm64 |Apple M1 |OSX |CMAKE/XCode| - | LLVM | - | OpenMP | - | - | static | Cirrus | | +| arm64 |Apple M1 |OSX |CMAKE/Xcode| - | LLVM | - | OpenMP | - | + | static | Cirrus | | +| arm64 |Apple M1 |OSX |CMAKE/XCode|x86_64| LLVM| - | - | + | - | static | Cirrus | | +| arm64 |Neoverse N1|Linux |gmake | - |gcc10.2| -| pthreads| - | - | both | Cirrus | | +| arm64 |Neoverse N1|Linux |gmake | - |gcc10.2| -| pthreads| - | + | both | Cirrus | | +| arm64 |Neoverse N1|Linux |gmake |- |gcc10.2| -| OpenMP | - | - | both |Cirrus | 8 | +| x86_64 | Ryzen| FreeBSD |gmake | - | gcc12.2|gfortran| pthreads| - | - | both | Cirrus | | +| x86_64 | Ryzen| FreeBSD |gmake | | gcc12.2|gfortran| pthreads| - | + | both | Cirrus | | +| x86_64 |GENERIC |QEMU |gmake| mips64 | gcc | gfortran | pthreads | - | - | static | Github | | +| x86_64 |SICORTEX |QEMU |gmake| mips64 | gcc | gfortran | pthreads | - | - | static | Github | | +| x86_64 |I6400 |QEMU |gmake| mips64 | gcc | gfortran | pthreads | - | - | static | Github | | +| x86_64 |P6600 |QEMU |gmake| mips64 | gcc | gfortran | pthreads | - | - | static | Github | | +| x86_64 |I6500 |QEMU |gmake| mips64 | gcc | gfortran | pthreads | - | - | static | Github | | +| x86_64 |Intel |Ubuntu |CMAKE| - | gcc-11.3 | gfortran | pthreads | + | - | static | Github | | +| x86_64 |Intel |Ubuntu |gmake| - | gcc-11.3 | gfortran | pthreads | + | - | both | Github | | +| x86_64 |Intel |Ubuntu |CMAKE| - | gcc-11.3 | flang-classic | pthreads | + | - | static | Github | | +| x86_64 |Intel |Ubuntu |gmake| - | gcc-11.3 | flang-classic | pthreads | + | - | both | Github | | +| x86_64 |Intel |macOS12 | CMAKE| - | AppleClang 14 | gfortran | pthreads | + | - | static | Github | | +| x86_64 |Intel |macOS12 | gmake| - | AppleClang 14 | gfortran | pthreads | + | - | both | Github | | +| 
x86_64 |Intel |Windows2022 | CMAKE/Ninja| - | mingw gcc 13 | gfortran | | + | - | static | Github | | +| x86_64 |Intel |Windows2022 | CMAKE/Ninja| - | mingw gcc 13 | gfortran | | + | + | static | Github | | +| x86_64 |Intel 32bit|Windows2022 | CMAKE/Ninja| - | mingw gcc 13 | gfortran | | + | - | static | Github | | +| x86_64 |Intel |Windows2022 | CMAKE/Ninja| - | LLVM 16 | - | | + | - | static | Github | | +| x86_64 |Intel | Windows2022 |CMAKE/Ninja| - | LLVM 16 | - | | + | + | static | Github | | +| x86_64 |Intel | Windows2022 |CMAKE/Ninja| - | gcc 13| - | | + | - | static | Github | | +| x86_64 |Intel| Ubuntu |gmake |mips64|gcc|gfortran|pthreads|+|-|both|Github| | +| x86_64 |generic|Ubuntu |gmake |riscv64|gcc|gfortran|pthreads|-|-|both|Github| | +| x86_64 |Intel|Ubuntu |gmake |mips32|gcc|gfortran|pthreads|-|-|both|Github | | +| x86_64 |Intel|Ubuntu |gmake |ia64|gcc|gfortran|pthreads|-|-|both|Github| | +| x86_64 |C910V|QEmu |gmake |riscv64|gcc|gfortran|pthreads|-|-|both|Github| | +|power |pwr9| Ubuntu |gmake | - |gcc|gfortran|OpenMP|-|-|both|OSUOSL| | +|zarch |z14 | Ubuntu |gmake | - |gcc|gfortran|OpenMP|-|-|both|OSUOSL| | diff --git a/docs/developers.md b/docs/developers.md new file mode 100644 index 00000000..b2c62eeb --- /dev/null +++ b/docs/developers.md @@ -0,0 +1,192 @@ +# Developer manual + +## Source code layout + +``` +OpenBLAS/ +├── benchmark Benchmark codes for BLAS +├── cmake CMakefiles +├── ctest Test codes for CBLAS interfaces +├── driver Implemented in C +│   ├── level2 +│   ├── level3 +│   ├── mapper +│   └── others Memory management, threading, etc +├── exports Generate shared library +├── interface Implement BLAS and CBLAS interfaces (calling driver or kernel) +│   ├── lapack +│   └── netlib +├── kernel Optimized assembly kernels for CPU architectures +│   ├── alpha Original GotoBLAS kernels for DEC Alpha +│   ├── arm ARMV5,V6,V7 kernels (including generic C codes used by other architectures) +│   ├── arm64 ARMV8 +│   ├── generic General kernel 
codes written in plain C, parts used by many architectures. +│   ├── ia64 Original GotoBLAS kernels for Intel Itanium +│ ├── mips +│   ├── mips64 +│   ├── power +| ├── riscv64 +| ├── simd Common code for Universal Intrinsics, used by some x86_64 and arm64 kernels +│   ├── sparc +│   ├── x86 +│ ├── x86_64 +│   └── zarch +├── lapack Optimized LAPACK codes (replacing those in regular LAPACK) +│   ├── getf2 +│   ├── getrf +│   ├── getrs +│   ├── laswp +│   ├── lauu2 +│   ├── lauum +│   ├── potf2 +│   ├── potrf +│   ├── trti2 +│ ├── trtri +│   └── trtrs +├── lapack-netlib LAPACK codes from netlib reference implementation +├── reference BLAS Fortran reference implementation (unused) +├── relapack Elmar Peise's recursive LAPACK (implemented on top of regular LAPACK) +├── test Test codes for BLAS +└── utest Regression test + +``` + +A call tree for `dgemm` looks as follows: +``` +interface/gemm.c + │ +driver/level3/level3.c + │ +gemm assembly kernels at kernel/ +``` + +To find the kernel currently used for a particular supported CPU, please check the corresponding `kernel/$(ARCH)/KERNEL.$(CPU)` file. + +Here is an example for `kernel/x86_64/KERNEL.HASWELL`: +``` +... +DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c +DGEMMKERNEL = dgemm_kernel_4x8_haswell.S +... +``` +According to the above `KERNEL.HASWELL`, OpenBLAS Haswell dgemm kernel file is `dgemm_kernel_4x8_haswell.S`. + + +## Optimizing GEMM for a given hardware + +!!! abstract "Read the Goto paper to understand the algorithm" + + Goto, Kazushige; van de Geijn, Robert A. (2008). + ["Anatomy of High-Performance Matrix Multiplication"](http://delivery.acm.org/10.1145/1360000/1356053/a12-goto.pdf?ip=155.68.162.54&id=1356053&acc=ACTIVE%20SERVICE&key=A79D83B43E50B5B8%2EF070BBE7E45C3F17%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517932837_edfe766f1e295d9a7830812371e1d173). 
+ ACM Transactions on Mathematical Software 34 (3): Article 12 + + (The above link is available only to ACM members, but this and many related + papers is also available on [the pages of van de Geijn's FLAME project](http://www.cs.utexas.edu/~flame/web/FLAMEPublications.html)) + +The `driver/level3/level3.c` is the implementation of Goto's algorithm. +Meanwhile, you can look at `kernel/generic/gemmkernel_2x2.c`, which is a naive +`2x2` register blocking `gemm` kernel in C. Then: + +* Write optimized assembly kernels. Consider instruction pipeline, available registers, memory/cache access. +* Tune cache block sizes (`Mc`, `Kc`, and `Nc`) + +Note that not all of the CPU-specific parameters in `param.h` are actively used in algorithms. +`DNUMOPT` only appears as a scale factor in profiling output of the level3 `syrk` interface code, +while its counterpart `SNUMOPT` (aliased as `NUMOPT` in `common.h`) is not used anywhere at all. + +`SYMV_P` is only used in the generic kernels for the `symv` and `chemv`/`zhemv` functions - +at least some of those are usually overridden by CPU-specific implementations, so if you start +by cloning the existing implementation for a related CPU you need to check its `KERNEL` file +to see if tuning `SYMV_P` would have any effect at all. + +`GEMV_UNROLL` is only used by some older x86-64 kernels, so not all sections in `param.h` define it. +Similarly, not all of the CPU parameters like L2 or L3 cache sizes are necessarily used in current +kernels for a given model - by all indications the CPU identification code was imported from some +other project originally. + + +## Running OpenBLAS tests + +We use tests for Netlib BLAS, CBLAS, and LAPACK. In addition, we use +OpenBLAS-specific regression tests. 
They can be run with Make: + +* `make -C test` for BLAS tests +* `make -C ctest` for CBLAS tests +* `make -C utest` for OpenBLAS regression tests +* `make lapack-test` for LAPACK tests + +We also use the [BLAS-Tester](https://github.com/xianyi/BLAS-Tester) tests for regression testing. +It is basically the ATLAS test suite adapted for building with OpenBLAS. + +The project makes use of several Continuous Integration (CI) services +conveniently interfaced with GitHub to automatically run tests on a number of +platforms and build configurations. + +Also note that the test suites included with "numerically heavy" projects like +Julia, NumPy, SciPy, Octave or QuantumEspresso can be used for regression +testing, when those projects are built such that they use OpenBLAS. + + +## Benchmarking + +A number of benchmarking methods are used by OpenBLAS: + +- Several simple C benchmarks for performance testing individual BLAS functions + are available in the `benchmark` folder. They can be run locally through the + `Makefile` in that directory. And the `benchmark/scripts` subdirectory + contains similar benchmarks that use OpenBLAS via NumPy, SciPy, Octave and R. +- On pull requests, a representative set of functions is tested for performance + regressions with Codspeed; results can be viewed at + [https://codspeed.io/OpenMathLib/OpenBLAS](https://codspeed.io/OpenMathLib/OpenBLAS). +- The [OpenMathLib/BLAS-Benchmarks](https://github.com/OpenMathLib/BLAS-Benchmarks) repository + contains an [Airspeed Velocity](https://github.com/airspeed-velocity/asv/)-based benchmark + suite which is run on several CPU architectures in cron jobs. Results are published + to a dashboard: [http://www.openmathlib.org/BLAS-Benchmarks/](http://www.openmathlib.org/BLAS-Benchmarks/). + +Benchmarking code for BLAS libraries, and specific performance analysis results, can be found +in a number of places. 
For example: + +* [MatlabJuliaMatrixOperationsBenchmark](https://github.com/RoyiAvital/MatlabJuliaMatrixOperationsBenchmark) + (various matrix operations in Julia and Matlab) +* [mmperf/mmperf](https://github.com/mmperf/mmperf/) (single-core matrix multiplication) + + +## Adding autodetection support for a new revision or variant of a supported CPU + +Especially relevant for x86-64, a new CPU model may be a "refresh" (die shrink and/or different number of cores) within an existing +model family without significant changes to its instruction set (e.g., Intel Skylake and Kaby Lake still are fundamentally the same architecture as Haswell, +low end Goldmont etc. are Nehalem). In this case, compilation with the appropriate older `TARGET` will already lead to a satisfactory build. + +To achieve autodetection of the new model, its CPUID (or an equivalent identifier) needs to be added in the `cpuid_.c` +relevant for its general architecture, with the returned name for the new type set appropriately. For x86, which has the most complex +`cpuid` file, there are two functions that need to be edited: `get_cpuname()` to return, e.g., `CPUTYPE_HASWELL` and `get_corename()` for the (broader) +core family returning, e.g., `CORE_HASWELL`.[^1] + +[^1]: + This information ends up in the `Makefile.conf` and `config.h` files generated by `getarch`. Failure to + set either will typically lead to a missing definition of the `GEMM_UNROLL` parameters later in the build, + as `getarch_2nd` will be unable to find a matching parameter section in `param.h`. + +For architectures where `DYNAMIC_ARCH` builds are supported, a similar but simpler code section for the corresponding +runtime detection of the CPU exists in `driver/others/dynamic.c` (for x86), and `driver/others/dynamic_.c` for other architectures. 
+Note that for x86 the CPUID is compared after splitting it into its family, extended family, model and extended model parts, so the single decimal +number returned by Linux in `/proc/cpuinfo` for the model has to be converted back to hexadecimal before splitting into its constituent +digits. For example, `142 == 8E` translates to extended model 8, model 14. + + +## Adding dedicated support for a new CPU model + +Usually it will be possible to start from an existing model, clone its `KERNEL` configuration file to the new name to use for this +`TARGET` and eventually replace individual kernels with versions better suited for peculiarities of the new CPU model. +In addition, it is necessary to add (or clone at first) the corresponding section of `GEMM_UNROLL` parameters in the top-level `param.h`, +and possibly to add definitions such as `USE_TRMM` (governing whether `TRMM` functions use the respective `GEMM` kernel or a separate source file) +to the `Makefile`s (and `CMakeLists.txt`) in the kernel directory. The new CPU name needs to be added to `TargetList.txt`, +and the CPU auto-detection code used by the `getarch` helper program - contained in +the `cpuid_.c` file amended to include the CPUID (or equivalent) information processing required (see preceding section). + + +## Adding support for an entirely new architecture + +This endeavour is best started by cloning the entire support structure for 32-bit ARM, and within that the ARMv5 CPU in particular, +as this is implemented through plain C kernels only. An example providing a convenient "shopping list" can be seen in pull request +[#1526](https://github.com/OpenMathLib/OpenBLAS/pull/1526). 
diff --git a/docs/distributing.md b/docs/distributing.md index 1e6372a2..98b390a9 100644 --- a/docs/distributing.md +++ b/docs/distributing.md @@ -1,11 +1,12 @@ -# Guidance for redistributing OpenBLAS +# Redistributing OpenBLAS -*We note that this document contains recommendations only - packagers and other -redistributors are in charge of how OpenBLAS is built and distributed in their -systems, and may have good reasons to deviate from the guidance given on this -page. These recommendations are aimed at general packaging systems, with a user -base that typically is large, open source (or freely available at least), and -doesn't behave uniformly or that the packager is directly connected with.* +!!! note + This document contains recommendations only - packagers and other + redistributors are in charge of how OpenBLAS is built and distributed in their + systems, and may have good reasons to deviate from the guidance given on this + page. These recommendations are aimed at general packaging systems, with a user + base that typically is large, open source (or freely available at least), and + doesn't behave uniformly or that the packager is directly connected with.* OpenBLAS has a large number of build-time options which can be used to change how it behaves at runtime, how artifacts or symbols are named, etc. Variation @@ -48,7 +49,7 @@ settings): to provide an ILP64 interface build as well, use a symbol suffix to avoid symbol name clashes (see the next section). -[^1] All major distributions do include LAPACK as of mid 2023 as far as we +[^1]: All major distributions do include LAPACK as of mid 2023 as far as we know. Older versions of Arch Linux did not, and that was known to cause problems. diff --git a/docs/extensions.md b/docs/extensions.md new file mode 100644 index 00000000..483b0092 --- /dev/null +++ b/docs/extensions.md @@ -0,0 +1,39 @@ +OpenBLAS for the most part contains implementations of the reference (Netlib) +BLAS, CBLAS, LAPACK and LAPACKE interfaces. 
A few OpenBLAS-specific functions +are also provided however, which mostly can be seen as "BLAS extensions". +This page documents those non-standard APIs. + +## BLAS-like extensions + +| Routine | Data Types | Description | +| ------------- |:------------- | :---------------| +| ?axpby | s,d,c,z | like axpy with a multiplier for y | +| ?gemm3m | c,z | gemm3m | +| ?imatcopy | s,d,c,z | in-place transposition/copying | +| ?omatcopy | s,d,c,z | out-of-place transposition/copying | +| ?geadd | s,d,c,z | matrix add | +| ?gemmt | s,d,c,z | gemm but only a triangular part updated| + + +## bfloat16 functionality + +BLAS-like and conversion functions for `bfloat16` (available when OpenBLAS was compiled with `BUILD_BFLOAT16=1`): + +* `void cblas_sbstobf16` converts a float array to an array of bfloat16 values by rounding +* `void cblas_sbdtobf16` converts a double array to an array of bfloat16 values by rounding +* `void cblas_sbf16tos` converts a bfloat16 array to an array of floats +* `void cblas_dbf16tod` converts a bfloat16 array to an array of doubles +* `float cblas_sbdot` computes the dot product of two bfloat16 arrays +* `void cblas_sbgemv` performs the matrix-vector operations of GEMV with the input matrix and X vector as bfloat16 +* `void cblas_sbgemm` performs the matrix-matrix operations of GEMM with both input arrays containing bfloat16 + +## Utility functions + +* `openblas_get_num_threads` +* `openblas_set_num_threads` +* `int openblas_get_num_procs(void)` returns the number of processors available on the system (may include "hyperthreading cores") +* `int openblas_get_parallel(void)` returns 0 for sequential use, 1 for platform-based threading and 2 for OpenMP-based threading +* `char * openblas_get_config()` returns the options OpenBLAS was built with, something like `NO_LAPACKE DYNAMIC_ARCH NO_AFFINITY Haswell` +* `int openblas_set_affinity(int thread_index, size_t cpusetsize, cpu_set_t *cpuset)` sets the CPU affinity mask of the given thread + to the 
provided cpuset. Only available on Linux, with semantics identical to `pthread_setaffinity_np`. + diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 00000000..699042d5 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,345 @@ +--- +title: FAQ +--- + + + +[TOC] + +## General questions + +### What is BLAS? Why is it important? + +[BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) stands for Basic Linear Algebra Subprograms. BLAS provides standard interfaces for [linear algebra](https://en.wikipedia.org/wiki/Linear_algebra), including BLAS1 (vector-vector operations), BLAS2 (matrix-vector operations), and BLAS3 (matrix-matrix operations). In general, BLAS is the computational kernel ("the bottom of the food chain") in linear algebra or scientific applications. Thus, if BLAS implementation is highly optimized, the whole application can get substantial benefit. + +### What functions are there and how can I call them from my C code? + +As BLAS is a standardized interface, you can refer to the documentation of its reference implementation at [netlib.org](http://netlib.org/blas/index.html#_blas_routines). Calls from C go through its CBLAS interface, +so your code will need to include the provided cblas.h in addition to linking with -lopenblas. +A single-precision matrix multiplication will look like +``` +#include +... +cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0, A, K, B, N, 0.0, result, N); +``` +where M,N,K are the dimensions of your data - see https://petewarden.files.wordpress.com/2015/04/gemm_corrected.png + (This image is part of an article on GEMM in the context of deep learning that is well worth reading in full - +https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/) + +### What is OpenBLAS? Why did you create this project? + +OpenBLAS is an open source BLAS library forked from the GotoBLAS2-1.13 BSD version. Since Mr. Kazushige Goto left TACC, GotoBLAS is no longer being maintained. 
Thus, we created this project to continue developing OpenBLAS/GotoBLAS. + +### What's the difference between OpenBLAS and GotoBLAS? + +In OpenBLAS 0.2.0, we optimized level 3 BLAS on the Intel Sandy Bridge 64-bit OS. We obtained a performance comparable with that Intel MKL. + +We optimized level 3 BLAS performance on the [ICT Loongson-3A](http://en.wikipedia.org/wiki/Loongson) CPU. It outperformed GotoBLAS by 135% in a single thread and 120% in 4 threads. + +We fixed some GotoBLAS bugs including a SEGFAULT bug on the new Linux kernel, MingW32/64 bugs, and a ztrmm computing error bug on Intel Nehalem. + +We also added some minor features, e.g. supporting "make install", compiling without LAPACK and upgrading the LAPACK version to 3.4.2. + +You can find the full list of modifications in Changelog.txt. + +### Where do parameters GEMM_P, GEMM_Q, GEMM_R come from? + +The detailed explanation is probably in the original publication authored by Kazushige Goto - Goto, Kazushige; van de Geijn, Robert A; Anatomy of high-performance matrix multiplication. ACM Transactions on Mathematical Software (TOMS). Volume 34 Issue 3, May 2008 +While this article is paywalled and too old for preprints to be available on arxiv.org, more recent +publications like https://arxiv.org/pdf/1609.00076 contain at least a brief description of the algorithm. +In practice, the values are derived by experimentation to yield the block sizes that give the highest performance. A general rule of thumb for selecting a starting point seems to be that PxQ is about half the size of L2 cache. + +### How can I report a bug? + +Please file an issue at this [issue page](https://github.com/xianyi/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users). + +Please provide the following information: CPU, OS, compiler, and OpenBLAS compiling flags (Makefile.rule). In addition, please describe how to reproduce this bug. + +### How to reference OpenBLAS. 
+ +You can reference our papers in [this page](about.md#publications). Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net. + +### How can I use OpenBLAS in multi-threaded applications? + +If your application is already multi-threaded, it will conflict with OpenBLAS multi-threading. Thus, you must set OpenBLAS to use single thread as following. + + * export OPENBLAS_NUM_THREADS=1 in the environment variables. +Or + * Call openblas_set_num_threads(1) in the application on runtime. +Or + * Build OpenBLAS single thread version, e.g. make USE_THREAD=0 USE_LOCKING=1 (see comment below) + +If the application is parallelized by OpenMP, please build OpenBLAS with USE_OPENMP=1 + +With the increased availability of fast multicore hardware it has unfortunately become clear that the thread management provided by OpenMP is not sufficient to prevent race conditions when OpenBLAS was built single-threaded by USE_THREAD=0 and there are concurrent calls from multiple threads to OpenBLAS functions. In this case, +it is vital to also specify USE_LOCKING=1 (introduced with OpenBLAS 0.3.7). + +### Does OpenBLAS support sparse matrices and/or vectors ? + +OpenBLAS implements only the standard (dense) BLAS and LAPACK functions with a select few extensions popularized by Intel's MKL. Some +cases can probably be made to work using e.g. GEMV or AXPBY, in general using a dedicated package like SuiteSparse (which can make use of OpenBLAS or equivalent for standard operations) is recommended. + +### What support is there for recent PC hardware ? What about GPU ? + + As OpenBLAS is a volunteer project, it can take some time for the combination of a capable developer, +free time, and particular hardware to come along, even for relatively common processors. Starting from 0.3.1, support +is being added for AVX 512 (TARGET=SKYLAKEX), requiring a compiler that is capable of handling avx512 intrinsics. 
+While AMD Zen processors should be autodetected by the build system, as of 0.3.2 they are still handled exactly +like Intel Haswell. There once was an effort to build an OpenCL implementation that one can still find at https://github.com/xianyi/clOpenBLAS , but work on this stopped in 2015. + +### How about the level 3 BLAS performance on Intel Sandy Bridge? + +We obtained a performance comparable with Intel MKL that actually outperformed Intel MKL in some cases. +Here is the result of the DGEMM subroutine's performance on Intel Core i5-2500K Windows 7 SP1 64-bit: +![Single Thread DGEMM Performance on Intel Desktop Sandy Bridge](http://xianyi.github.com/OpenBLAS/dgemm_snb_1thread.png) + +


+ +## OS and Compiler + +### How can I call an OpenBLAS function in Microsoft Visual Studio? + +Please read [this page](install.md#visual-studio). + +### How can I use CBLAS and LAPACKE without C99 complex number support (e.g. in Visual Studio)? + +Zaheer has fixed this bug. You can now use the structure instead of C99 complex numbers. Please read [this issue page](http://github.com/xianyi/OpenBLAS/issues/95) for details. + +[This issue](https://github.com/xianyi/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio. + +### I get a SEGFAULT with multi-threading on Linux. What's wrong? + +This may be related to a bug in the Linux kernel 2.6.32 (?). Try applying the patch segfaults.patch to disable mbind using + + patch < segfaults.patch + +and see if the crashes persist. Note that this patch will lead to many compiler warnings. + +### When I make the library, there is no such instruction: `xgetbv' error. What's wrong? + +Please use GCC 4.4 and later version. This version supports xgetbv instruction. If you use the library for Sandy Bridge with AVX instructions, you should use GCC 4.6 and later version. + +On Mac OS X, please use Clang 3.1 and later version. For example, make CC=clang + +For the compatibility with old compilers (GCC < 4.4), you can enable NO_AVX flag. For example, make NO_AVX=1 + +### My build fails due to the linker error "multiple definition of `dlamc3_'". What is the problem? + +This linker error occurs if GNU patch is missing or if our patch for LAPACK fails to apply. + +Background: OpenBLAS implements optimized versions of some LAPACK functions, so we need to disable the reference versions. If this process fails we end with duplicated implementations of the same function. + +### My build worked fine and passed all tests, but running `make lapack-test` ends with segfaults + +Some of the LAPACK tests, notably in xeigtstz, try to allocate around 10MB on the stack. 
You may need to use +`ulimit -s` to change the default limits on your system to allow this. + +### How could I disable OpenBLAS threading affinity on runtime? + +You can define the OPENBLAS_MAIN_FREE or GOTOBLAS_MAIN_FREE environment variable to disable threading affinity on runtime. For example, before the running, +``` +export OPENBLAS_MAIN_FREE=1 +``` + +Alternatively, you can disable affinity feature with enabling NO_AFFINITY=1 in Makefile.rule. + +### How to solve undefined reference errors when statically linking against libopenblas.a + +On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by default), custom programs statically linked against `libopenblas.a` should also link to the pthread library e.g.: + +``` +gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread +``` + +Failing to add the `-lpthread` flag will cause errors such as: + +``` +/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory': +memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock' +memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock' +/opt/OpenBLAS/libopenblas.a(memory.o): In function `openblas_fork_handler': +memory.c:(.text+0x440): undefined reference to `pthread_atfork' +/opt/OpenBLAS/libopenblas.a(memory.o): In function `blas_memory_alloc': +memory.c:(.text+0x7a5): undefined reference to `pthread_mutex_lock' +memory.c:(.text+0x825): undefined reference to `pthread_mutex_unlock' +/opt/OpenBLAS/libopenblas.a(memory.o): In function `blas_shutdown': +memory.c:(.text+0x9e1): undefined reference to `pthread_mutex_lock' +memory.c:(.text+0xa6e): undefined reference to `pthread_mutex_unlock' +/opt/OpenBLAS/libopenblas.a(blas_server.o): In function `blas_thread_server': +blas_server.c:(.text+0x273): undefined reference to `pthread_mutex_lock' +blas_server.c:(.text+0x287): undefined reference to `pthread_mutex_unlock' +blas_server.c:(.text+0x33f): undefined reference to 
`pthread_cond_wait' +/opt/OpenBLAS/libopenblas.a(blas_server.o): In function `blas_thread_init': +blas_server.c:(.text+0x416): undefined reference to `pthread_mutex_lock' +blas_server.c:(.text+0x4be): undefined reference to `pthread_mutex_init' +blas_server.c:(.text+0x4ca): undefined reference to `pthread_cond_init' +blas_server.c:(.text+0x4e0): undefined reference to `pthread_create' +blas_server.c:(.text+0x50f): undefined reference to `pthread_mutex_unlock' +... +``` + +The `-lpthread` is not required when linking dynamically against `libopenblas.so.0`. + +### Building OpenBLAS for Haswell or Dynamic Arch on RHEL-6, CentOS-6, Rocks-6.1,Scientific Linux 6 + +Minimum requirement to actually run AVX2-enabled software like OpenBLAS is kernel-2.6.32-358, shipped with EL6U4 in 2013 + +The `binutils` package from RHEL6 does not know the instruction `vpermpd` or any other AVX2 instruction. You can download a newer `binutils` package from Enterprise Linux software collections, following instructions here:
+https://www.softwarecollections.org/en/scls/rhscl/devtoolset-3/
+After configuring the repository you need to install devtoolset-?-binutils to get later usable binutils package +``` +$ yum search devtoolset-\?-binutils +$ sudo yum install devtoolset-3-binutils +``` +once packages are installed check the correct name for SCL redirection set to enable new version +``` +$ scl --list +devtoolset-3 +rh-python35 +``` +Now just prefix your build commands with respective redirection: +``` +$ scl enable devtoolset-3 -- make DYNAMIC_ARCH=1 +``` +AVX-512 (SKYLAKEX) support requires devtoolset-8-gcc-gfortran (which exceeds formal requirement for AVX-512 because of packaging issues in earlier packages) which dependency-installs respective binutils and gcc or later and kernel 2.6.32-696 aka 6U9 or 3.10.0-327 aka 7U2 or later to run. In absence of abovementioned toolset OpenBLAS will fall back to AVX2 instructions in place of AVX512 sacrificing some performance on SKYLAKE-X platform. + +### Building OpenBLAS in QEMU/KVM/XEN + +By default, QEMU reports the CPU as "QEMU Virtual CPU version 2.2.0", which shares CPUID with existing 32bit CPU even in 64bit virtual machine, and OpenBLAS recognizes it as PENTIUM2. Depending on the exact combination of CPU features the hypervisor chooses to expose, this may not correspond to any CPU that exists, and OpenBLAS will error when trying to build. To fix this, pass `-cpu host` or `-cpu passthrough` to QEMU, or another CPU model. +Similarly, the XEN hypervisor may not pass through all features of the host cpu while reporting the cpu type itself correctly, which can +lead to compiler error messages about an "ABI change" when compiling AVX512 code. Again changing the Xen configuration by running e.g. +"xen-cmdline --set-xen cpuid=avx512" should get around this (as would building OpenBLAS for an older cpu lacking that particular feature, e.g. 
TARGET=HASWELL) + +### Building OpenBLAS on POWER fails with IBM XL + + Trying to compile OpenBLAS with IBM XL ends with error messages about unknown register names +like "vs32". Working around these by using known alternate names for the vector registers only leads to another assembler error about unsupported constraints. This is a known deficiency in the IBM compiler at least up to and including 16.1.0 (and in the POWER version of clang, from which it is derived) - use gcc instead. (See issues #1078 +and #1699 for related discussions) + +### Replacing system BLAS/updating APT OpenBLAS in Mint/Ubuntu/Debian + +Debian and Ubuntu LTS versions provide OpenBLAS package which is not updated after initial release, and under circumstances one might want to use more recent version of OpenBLAS e.g. to get support for newer CPUs + +Ubuntu and Debian provides 'alternatives' mechanism to comfortably replace BLAS and LAPACK libraries systemwide. + +After successful build of OpenBLAS (with DYNAMIC_ARCH set to 1) + +``` +$ make clean +$ make DYNAMIC_ARCH=1 +$ sudo make DYNAMIC_ARCH=1 install +``` +One can redirect BLAS and LAPACK alternatives to point to source-built OpenBLAS +First you have to install NetLib LAPACK reference implementation (to have alternatives to replace): +``` +$ sudo apt install libblas-dev liblapack-dev +``` +Then we can set alternative to our freshly-built library: +``` +$ sudo update-alternatives --install /usr/lib/libblas.so.3 libblas.so.3 /opt/OpenBLAS/lib/libopenblas.so.0 41 \ + --slave /usr/lib/liblapack.so.3 liblapack.so.3 /opt/OpenBLAS/lib/libopenblas.so.0 +``` +Or remove redirection and switch back to APT-provided BLAS implementation order: +``` +$ sudo update-alternatives --remove libblas.so.3 /opt/OpenBLAS/lib/libopenblas.so.0 +``` +In recent versions of the distributions, the installation path for the libraries has been changed to include the name of the host architecture, like /usr/lib/x86_64-linux-gnu/blas/libblas.so.3 or 
libblas.so.3.x86_64-linux-gnu. Use ```$ update-alternatives --display libblas.so.3``` +to find out what layout your system has. + +### I built OpenBLAS for use with some other software, but that software cannot find it + +Openblas installs as a single library named libopenblas.so, while some programs may be searching for a separate libblas.so and liblapack.so so you may need to create appropriate symbolic links (`ln -s libopenblas.so libblas.so; +ln -s libopenblas.so liblapack.so`) or copies. Also make sure that the installation location (usually /opt/OpenBLAS/lib or /usr/local/lib) is among the library search paths of your system. + +### I included cblas.h in my program, but the compiler complains about a missing common.h or functions from it + +You probably tried to include a cblas.h that you simply copied from the OpenBLAS source, instead you need to run +`make install` after building OpenBLAS and then use the modified cblas.h that this step builds in the installation +path (usually either /usr/local/include, /opt/OpenBLAS/include or whatever you specified as PREFIX= on the `make install`) + +### Compiling OpenBLAS with gcc's -fbounds-check actually triggers aborts in programs + +This is due to different interpretations of the (informal) standard for passing characters as arguments between C and FORTRAN functions. As the method for storing text differs in the two languages, when C calls Fortran the text length is passed as an "invisible" additional parameter. +Historically, this has not been required when the text is just a single character, so older code like the Reference-LAPACK bundled with OpenBLAS +does not do it. Recently gcc's checking has changed to require it, but there is no consensus yet if and how the existing LAPACK (and many other codebases) should adapt. 
(And for actual compilation, gcc has mostly backtracked and provided compatibility options - hence the default build settings in the OpenBLAS Makefiles add -fno-optimize-sibling-calls to the gfortran options to prevent miscompilation with "affected" versions. See ticket 2154 in the issue tracker for more details and links) +
+ +### Build fails with lots of errors about undefined ?GEMM_UNROLL_M + +Your cpu is apparently too new to be recognized by the build scripts, so they failed to assign appropriate parameters for the block algorithm. +Do a `make clean` and try again with TARGET set to one of the cpu models listed in `TargetList.txt` - for x86_64 this will usually be HASWELL. + +### CMAKE/OSX: Build fails with 'argument list too long' + +This is a limitation in the maximum length of a command on OSX, coupled with how CMAKE works. You should be able to work around this +by adding the option `-DCMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS=1` to your CMAKE arguments. + +### Likely problems with AVX2 support in Docker Desktop for OSX + +There have been a few reports of wrong calculation results and build-time test failures when building in a container environment managed by the OSX version of Docker Desktop, which uses the xhyve virtualizer underneath. Judging from these reports, AVX2 support in xhyve appears to be subtly broken but a corresponding ticket in the xhyve issue tracker has not drawn any reaction or comment since 2019. Therefore it is strongly recommended to build OpenBLAS with the NO_AVX2=1 option when inside a container under (or for later use with) the Docker Desktop environment on Intel-based Apple hardware. + +## Usage + +### Program is Terminated. Because you tried to allocate too many memory regions + +In OpenBLAS, we manage a pool of memory buffers and allocate the number of buffers as follows: +``` +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) +``` +This error indicates that the program exceeded the number of buffers. + +Please build OpenBLAS with larger `NUM_THREADS`. For example, `make NUM_THREADS=32` or `make NUM_THREADS=64`. +In `Makefile.system`, we will set `MAX_CPU_NUMBER=NUM_THREADS`. 
+ +### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH + +The environment variable which controls the kernel selection is `OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) +e.g. `export OPENBLAS_CORETYPE=Haswell`. And the function `char* openblas_get_corename()` returns the used target. + +### After updating the installed OpenBLAS, a program complains about "undefined symbol gotoblas" + +This symbol gets defined only when OpenBLAS is built with "make DYNAMIC_ARCH=1" (which is what distributors will choose to ensure support for more than just one CPU type). + +### How can I find out at runtime what options the library was built with ? + +OpenBLAS has two utility functions that may come in here: + +openblas_get_parallel() will return 0 for a single-threaded library, 1 if multithreading without OpenMP, 2 if built with USE_OPENMP=1 + +openblas_get_config() will return a string containing settings such as USE64BITINT or DYNAMIC_ARCH that were active at build time, as well as the target cpu (or in case of a dynamic_arch build, the currently detected one). + +### After making OpenBLAS, I find that the static library is multithreaded, but the dynamic one is not ? + +The shared OpenBLAS library you built is probably working fine as well, but your program may be picking up a different (probably single-threaded) version from one of the standard system paths like /usr/lib on startup. +Running `ldd /path/to/your/program` will tell you which library the linkage loader will actually use. + +Specifying the "correct" library location with the `-L` flag (like `-L /opt/OpenBLAS/lib`) when linking your program only defines which library will be used to see if all symbols _can_ be resolved; you will need to add an rpath entry to the binary (using `-Wl,-rpath=/opt/OpenBLAS/lib`) to make it request searching that location. Alternatively, remove the "wrong old" library (if you can), or set LD_LIBRARY_PATH to the desired location before running your program. 
+ +### I want to use OpenBLAS with CUDA in the HPL 2.3 benchmark code but it keeps looking for Intel MKL + +You need to edit file src/cuda/cuda_dgemm.c in the NVIDIA version of HPL, change the "handle2" and "handle" dlopen calls to use libopenblas.so instead of libmkl_intel_lp64.so, and add a trailing underscore in the dlsym lines for dgemm_mkl and dtrsm_mkl (like `dgemm_mkl = (void(*)())dlsym(handle, "dgemm_");`) + +### Multithreaded OpenBLAS runs no faster or is even slower than singlethreaded on my ARMV7 board + +The power saving mechanisms of your board may have shut down some cores, making them invisible to OpenBLAS in its startup phase. Try bringing them online before starting your calculation. + +### Speed varies wildly between individual runs on a typical ARMV8 smartphone processor + +Check the technical specifications, it could be that the SoC combines fast and slow cpus and threads can end up on either. In that case, binding the process to specific cores e.g. by setting `OMP_PLACES=cores` may help. (You may need to experiment with OpenMP options, it has been reported that using `OMP_NUM_THREADS=2 OMP_PLACES=cores` caused +a huge drop in performance on a 4+4 core chip while `OMP_NUM_THREADS=2 OMP_PLACES=cores(2)` worked as intended - as did OMP_PLACES=cores with 4 threads) + +### I cannot get OpenBLAS to use more than a small subset of available cores on a big system + +Multithreading support in OpenBLAS requires the use of internal buffers for sharing partial results, the number and size of which is defined at compile time. Unless you specify NUM_THREADS in your make or cmake command, the build scripts try to autodetect the number of cores available in your build host to size the library to match. This unfortunately means that if you move the resulting binary from a small "front-end node" to a larger "compute node" later, it will still be limited to the hardware capabilities of the original system. 
The solution is to set NUM_THREADS to a number big enough to encompass the biggest systems you expect to run the binary on - at runtime, it will scale down the maximum number of threads it uses to match the number of cores physically available. + +### Getting "ELF load command address/offset not properly aligned" when loading libopenblas.so + +If you get a message "error while loading shared libraries: libopenblas.so.0: ELF load command address/offset not properly aligned" when starting a program that is (dynamically) linked to OpenBLAS, this is very likely due to a bug in the GNU linker (ld) that is part of the +GNU binutils package. This error was specifically observed on older versions of Ubuntu Linux updated with the (at the time) most recent binutils version 2.38, but an internet search turned up sporadic reports involving various other libraries dating back several years. A bugfix was created by the binutils developers and should be available in later versions of binutils.(See issue 3708 for details) + +#### Using OpenBLAS with OpenMP + +OpenMP provides its own locking mechanisms, so when your code makes BLAS/LAPACK calls from inside OpenMP parallel regions it is imperative +that you use an OpenBLAS that is built with USE_OPENMP=1, as otherwise deadlocks might occur. Furthermore, OpenBLAS will automatically restrict itself to using only a single thread when called from an OpenMP parallel region. When it is certain that calls will only occur +from the main thread of your program (i.e. outside of omp parallel constructs), a standard pthreads build of OpenBLAS can be used as well. In that case it may be useful to tune the linger behaviour of idle threads in both your OpenMP program (e.g. 
set OMP_WAIT_POLICY=passive) and OpenBLAS (by redefining the THREAD_TIMEOUT variable at build time, or setting the environment variable OPENBLAS_THREAD_TIMEOUT smaller than the default 26) so that the two alternating thread pools do not unnecessarily hog the cpu during the handover. + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..3358491d --- /dev/null +++ b/docs/index.md @@ -0,0 +1,11 @@ +## Introduction + +OpenBLAS is an optimized Basic Linear Algebra Subprograms (BLAS) library based on [GotoBLAS2](https://www.tacc.utexas.edu/research-development/tacc-software/gotoblas2) 1.13 BSD version. + +OpenBLAS implements low-level routines for performing linear algebra operations such as vector addition, scalar multiplication, dot products, linear combinations, and matrix multiplication. OpenBLAS makes these routines available on multiple platforms, covering server, desktop and mobile operating systems, as well as different architectures including x86, ARM, MIPS, PPC, RISC-V, and zarch. + +The old GotoBLAS documentation can be found [on GitHub](https://github.com/OpenMathLib/OpenBLAS/blob/develop/GotoBLAS_01Readme.txt). + +## License + +OpenBLAS is licensed under the 3-clause BSD license. The full license can be found [on GitHub](https://github.com/OpenMathLib/OpenBLAS/blob/develop/LICENSE). diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 00000000..ffb4659d --- /dev/null +++ b/docs/install.md @@ -0,0 +1,777 @@ +# Install OpenBLAS + +OpenBLAS can be installed through package managers or from source. If you only +want to use OpenBLAS rather than make changes to it, we recommend installing a +pre-built binary package with your package manager of choice. + +This page contains an overview of installing with package managers as well as +from source. For the latter, see [further down on this page](#building-from-source). + + +## Installing with a package manager + +!!! 
note + Almost every package manager provides OpenBLAS packages; the list on this + page is not comprehensive. If your package manager of choice isn't shown + here, please search its package database for `openblas` or `libopenblas`. + + +### Linux + +On Linux, OpenBLAS can be installed with the system package manager, or with a +package manager like [Conda](https://docs.conda.io/en/latest/) +(or alternative package managers for the conda-forge ecosystem, like +[Mamba](https://mamba.readthedocs.io/en/latest/), +[Micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html), +or [Pixi](https://pixi.sh/latest/#windows-installer)), +[Spack](https://spack.io/), or [Nix](https://nixos.org/). For the latter set of +tools, the package name in all cases is `openblas`. Since package management in +quite a few of these tools is declarative (i.e., managed by adding `openblas` +to a metadata file describing the dependencies for your project or +environment), we won't attempt to give detailed instructions for these tools here. + +Linux distributions typically split OpenBLAS up in two packages: one containing +the library itself (typically named `openblas` or `libopenblas`), and one containing headers, +pkg-config and CMake files (typically named the same as the package for the +library with `-dev` or `-devel` appended; e.g., `openblas-devel`). Please keep +in mind that if you want to install OpenBLAS in order to use it directly in +your own project, you will need to install both of those packages. 
+ +Distro-specific installation commands: + +=== "Debian/Ubuntu/Mint/Kali" + + ```bash + $ sudo apt update + $ sudo apt install libopenblas-dev + ``` + OpenBLAS can be configured as the default BLAS through the `update-alternatives` mechanism: + + ```bash + $ sudo update-alternatives --config libblas.so.3 + ``` + +=== "openSUSE/SLE" + + ```bash + $ sudo zypper refresh + $ sudo zypper install openblas-devel + ``` + + OpenBLAS can be configured as the default BLAS through the `update-alternatives` mechanism: + ```bash + $ sudo update-alternatives --config libblas.so.3 + ``` + +=== "Fedora/CentOS/RHEL" + + ```bash + $ dnf check-update + $ dnf install openblas-devel + ``` + + !!! warning + + Fedora does not ship the pkg-config files for OpenBLAS. Instead, it wants you to + link against [FlexiBLAS](https://www.mpi-magdeburg.mpg.de/projects/flexiblas) (which + uses OpenBLAS by default as its backend on Fedora), which you can install with: + + ```bash + $ dnf install flexiblas-devel + ``` + + For CentOS and RHEL, OpenBLAS packages are provided via the [Fedora EPEL repository](https://fedoraproject.org/wiki/EPEL). + After adding that repository and its repository keys, you can install + `openblas-devel` with either `dnf` or `yum`. + +=== "Arch/Manjaro/Antergos" + + ```bash + $ sudo pacman -S openblas + ``` + + +### Windows + +=== "Conda-forge" + + OpenBLAS can be installed with `conda` (or `mamba`, `micromamba`, or + `pixi`) from conda-forge: + ``` + conda install openblas + ``` + + Conda-forge provides a method for switching the default BLAS implementation + used by all packages. To use that for OpenBLAS, install `libblas=*=*openblas` + (see [the docs on this mechanism](https://conda-forge.org/docs/maintainer/knowledge_base/#switching-blas-implementation) + for more details). 
+ +=== "vcpkg" + + OpenBLAS can be installed with vcpkg: + ```cmd + # In classic mode: + vcpkg install openblas + + # Or in manifest mode: + vcpkg add port openblas + ``` + +=== "OpenBLAS releases" + + Windows is the only platform for which binaries are made available by the + OpenBLAS project itself. They can be downloaded from the [GitHub + Releases](https://github.com/OpenMathLib/OpenBLAS/releases) page. These + binaries are built with MinGW, using the following build options: + ``` + NUM_THREADS=64 TARGET=GENERIC DYNAMIC_ARCH=1 DYNAMIC_OLDER=1 CONSISTENT_FPCSR=1 INTERFACE=0 + ``` + There are separate packages for x86-64 and x86. The zip archive contains + the include files, static and shared libraries, as well as configuration + files for getting them found via CMake or pkg-config. To use these + binaries, create a suitable folder for your OpenBLAS installation and unzip + the `.zip` bundle there (note that you will need to edit the provided + `openblas.pc` and `OpenBLASConfig.cmake` to reflect the installation path + on your computer, as distributed they have "win" or "win64" reflecting the + local paths on the system they were built on). + + Note that the same binaries can be downloaded + [from SourceForge](http://sourceforge.net/projects/openblas/files); this is + mostly of historical interest. + + +### macOS + +To install OpenBLAS with a package manager on macOS, run: + +=== "Homebrew" + + ```zsh + % brew install openblas + ``` + +=== "MacPorts" + + ```zsh + % sudo port install OpenBLAS-devel + ``` + +=== "Conda-forge" + + ```zsh + % conda install openblas + ``` + + Conda-forge provides a method for switching the default BLAS implementation + used by all packages. To use that for OpenBLAS, install `libblas=*=*openblas` + (see [the docs on this mechanism](https://conda-forge.org/docs/maintainer/knowledge_base/#switching-blas-implementation) + for more details). 
+ + +### FreeBSD + +You can install OpenBLAS from the FreeBSD [Ports collection](https://www.freebsd.org/ports/index.html): +``` +pkg install openblas +``` + + +## Building from source + +We recommend downloading the latest [stable version](https://github.com/OpenMathLib/OpenBLAS/releases) +from the GitHub Releases page, or checking it out from a git tag, rather than a +dev version from the `develop` branch. + +!!! tip + + The User manual contains [a section with detailed information on compiling OpenBLAS](user_manual.md#compiling-openblas), + including how to customize builds and how to cross-compile. Please read + that documentation first. This page contains only platform-specific build + information, and assumes you already understand the general build system + invocations to build OpenBLAS (with the specific build options you want to + control multi-threading and other non-platform-specific behavior). + + +### Linux and macOS + +Ensure you have C and Fortran compilers installed, then simply type `make` to compile the library. +There are no other build dependencies, nor unusual platform-specific +environment variables to set or other system setup to do. + +!!! note + + When building in an emulator (KVM, QEMU, etc.), please make sure that the combination of CPU features exposed to + the virtual environment matches that of an existing CPU to allow detection of the CPU model to succeed. + (With `qemu`, this can be done by passing `-cpu host` or a supported model name at invocation). + + +### Windows + +We support building OpenBLAS with either MinGW or Visual Studio on Windows. +Using MSVC will yield an OpenBLAS build with the Windows platform-native ABI. +Using MinGW will yield a different ABI. We'll describe both methods in detail +in this section, since the process for each is quite different. 
+ +#### Visual Studio & native Windows ABI + +For Visual Studio, you can use CMake to generate Visual Studio solution files; +note that you will need at least CMake 3.11 for linking to work correctly). + +Note that you need a Fortran compiler if you plan to build and use the LAPACK +functions included with OpenBLAS. The sections below describe using either +`flang` as an add-on to clang/LLVM or `gfortran` as part of MinGW for this +purpose. If you want to use the Intel Fortran compiler (`ifort` or `ifx`) for +this, be sure to also use the Intel C compiler (`icc` or `icx`) for building +the C parts, as the ABI imposed by `ifort` is incompatible with MSVC + +A fully-optimized OpenBLAS that can be statically or dynamically linked to your +application can currently be built for the 64-bit architecture with the LLVM +compiler infrastructure. We're going to use [Miniconda3](https://docs.anaconda.com/miniconda/) +to grab all of the tools we need, since some of them are in an experimental +status. Before you begin, you'll need to have Microsoft Visual Studio 2015 or +newer installed. + +1. Install Miniconda3 for 64-bit Windows using `winget install --id Anaconda.Miniconda3`, + or easily download from [conda.io](https://docs.conda.io/en/latest/miniconda.html). +2. Open the "Anaconda Command Prompt" now available in the Start Menu, or at `%USERPROFILE%\miniconda3\shell\condabin\conda-hook.ps1`. +3. In that command prompt window, use `cd` to change to the directory where you want to build OpenBLAS. +4. Now install all of the tools we need: + ``` + conda update -n base conda + conda config --add channels conda-forge + conda install -y cmake flang clangdev perl libflang ninja + ``` +5. Still in the Anaconda Command Prompt window, activate the 64-bit MSVC environment with `vcvarsall x64`. 
+ On Windows 11 with Visual Studio 2022, this would be done by invoking: + + ```shell + "c:\Program Files\Microsoft Visual Studio\2022\Community\vc\Auxiliary\Build\vcvars64.bat" + ``` + + With VS2019, the command should be the same (except for the year number of course). + For other versions of MSVC, please check the Visual Studio documentation for + exactly how to invoke the `vcvars64.bat` script. + + Confirm that the environment is active by typing `link`. This should return + a long list of possible options for the `link` command. If it just returns + _"command not found"_ or similar, review and retype the call to `vcvars64.bat`. + + !!! note + + if you are working from a Visual Studio command prompt window instead + (so that you do not have to do the `vcvars` call), you need to invoke + `conda activate` so that `CONDA_PREFIX` etc. get set up correctly before + proceeding to step 6. Failing to do so will lead to link errors like + `libflangmain.lib` not getting found later in the build. + +6. Now configure the project with CMake. Starting in the project directory, execute the following: + ``` + set "LIB=%CONDA_PREFIX%\Library\lib;%LIB%" + set "CPATH=%CONDA_PREFIX%\Library\include;%CPATH%" + mkdir build + cd build + cmake .. -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 -DDYNAMIC_ARCH=ON -DCMAKE_BUILD_TYPE=Release + ``` + + You may want to add further options in the `cmake` command here. For + instance, the default only produces a static `.lib` version of the library. + If you would rather have a DLL, add `-DBUILD_SHARED_LIBS=ON` above. Note that + this step only creates some command files and directories, the actual build + happens next. + +7. Build the project: + + ``` + cmake --build . 
--config Release + ``` + This step will create the OpenBLAS library in the `lib` directory, and + various build-time tests in the `test`, `ctest` and `openblas_utest` + directories. However it will not separate the header files you might need + for building your own programs from those used internally. To put all + relevant files in a more convenient arrangement, run the next step. + +8. Install all relevant files created by the build: + + ``` + cmake --install . --prefix c:\opt -v + ``` + This will copy all files that are needed for building and running your own + programs with OpenBLAS to the given location, creating appropriate + subdirectories for the individual kinds of files. In the case of `C:\opt` as + given above, this would be: + + - `C:\opt\include\openblas` for the header files, + - `C:\opt\bin` for the `libopenblas.dll` shared library, + - `C:\opt\lib` for the static library, and + - `C:\opt\share` holds various support files that enable other cmake-based + build scripts to find OpenBLAS automatically. + + +!!! tip "Change in complex types for Visual Studio 2017 and up" + + In newer Visual Studio versions, Microsoft has changed + [how it handles complex types](https://docs.microsoft.com/en-us/cpp/c-runtime-library/complex-math-support?view=msvc-170#types-used-in-complex-math). + Even when using a precompiled version of OpenBLAS, you might need to define + `LAPACK_COMPLEX_CUSTOM` in order to define complex types properly for MSVC. + For example, some variant of the following might help: + + ```c + #if defined(_MSC_VER) + #include + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex + #endif + ``` + + For reference, see + [openblas#3661](https://github.com/OpenMathLib/OpenBLAS/issues/3661), + [lapack#683](https://github.com/Reference-LAPACK/lapack/issues/683), and + [this Stack Overflow question](https://stackoverflow.com/questions/47520244/using-openblas-lapacke-in-visual-studio). + + +!!! 
warning "Building 32-bit binaries with MSVC" + + This method may produce binaries which demonstrate significantly lower + performance than those built with the other methods. The Visual Studio + compiler does not support the dialect of assembly used in the cpu-specific + optimized files, so only the "generic" `TARGET` which is written in pure C + will get built. For the same reason it is not possible (and not necessary) + to use `-DDYNAMIC_ARCH=ON` in a Visual Studio build. You may consider + building for the 32-bit architecture using the GNU (MinGW) ABI instead. + +##### CMake & Visual Studio integration + +To generate Visual Studio solution files, ensure CMake is installed and then run: +``` +# Do this from Powershell so cmake can find visual studio +cmake -G "Visual Studio 14 Win64" -DCMAKE_BUILD_TYPE=Release . +``` + +To then build OpenBLAS using those solution files from within Visual Studio, we +also need Perl. Please install it and ensure it's on the `PATH` (see, e.g., +[this Stack Overflow question for how](http://stackoverflow.com/questions/3051049/active-perl-installation-on-windows-operating-system)). + +If you build from within Visual Studio, the dependencies may not be +automatically configured: if you try to build `libopenblas` directly, it may +fail with a message saying that some `.obj` files aren't found. If this +happens, you can work around the problem by building the projects that +`libopenblas` depends on before building `libopenblas` itself. + +###### Build OpenBLAS for Universal Windows Platform + +OpenBLAS can be built targeting [Universal Windows Platform](https://en.wikipedia.org/wiki/Universal_Windows_Platform) +(UWP) like this: + +1. Follow the steps above to build the Visual Studio solution files for + Windows. This builds the helper executables which are required when building + the OpenBLAS Visual Studio solution files for UWP in step 2. +2. 
Remove the generated `CMakeCache.txt` and the `CMakeFiles` directory from + the OpenBLAS source directory, then re-run CMake with the following options: + + ``` + # do this to build UWP compatible solution files + cmake -G "Visual Studio 14 Win64" -DCMAKE_SYSTEM_NAME=WindowsStore -DCMAKE_SYSTEM_VERSION="10.0" -DCMAKE_SYSTEM_PROCESSOR=AMD64 -DVS_WINRT_COMPONENT=TRUE -DCMAKE_BUILD_TYPE=Release . + ``` +3. Now build the solution with Visual Studio. + + +#### MinGW & GNU ABI + +!!! note + + The resulting library from building with MinGW as described below can be + used in Visual Studio, but it can only be linked dynamically. This + configuration has not been thoroughly tested and should be considered + experimental. + + +To build OpenBLAS on Windows with MinGW: + +1. Install the MinGW (GCC) compiler suite, either the 32-bit + [MinGW](http://www.mingw.org/) or the 64-bit + [MinGW-w64](http://mingw-w64.sourceforge.net/) toolchain. Be sure to install + its `gfortran` package as well (unless you really want to build the BLAS part + of OpenBLAS only) and check that `gcc` and `gfortran` are the same version. + In addition, please install MSYS2 with MinGW. +2. Build OpenBLAS in the MSYS2 shell. Usually, you can just type `make`. + OpenBLAS will detect the compiler and CPU automatically. +3. After the build is complete, OpenBLAS will generate the static library + `libopenblas.a` and the shared library `libopenblas.dll` in the folder. You + can type `make PREFIX=/your/installation/path install` to install the + library to a certain location. + +Note that OpenBLAS will generate the import library `libopenblas.dll.a` for +`libopenblas.dll` by default. + +If you want to generate Windows-native PDB files from a MinGW build, you can +use the [cv2pdb](https://github.com/rainers/cv2pdb) tool to do so. + +To then use the built OpenBLAS shared library in Visual Studio: + +1. 
Copy the import library (`OPENBLAS_TOP_DIR/libopenblas.dll.a`) and the + shared library (`libopenblas.dll`) into the same folder (this must be the + folder of your project that is going to use the BLAS library. You may need + to add `libopenblas.dll.a` to the linker input list: `properties->Linker->Input`). +2. Please follow the Visual Studio documentation about using third-party .dll + libraries, and make sure to link against a library for the correct + architecture.[^1] +3. If you need CBLAS, you should include `cblas.h` in + `/your/installation/path/include` in Visual Studio. Please see + [openblas#95](http://github.com/OpenMathLib/OpenBLAS/issues/95) for more details. + +[^1]: + If the OpenBLAS DLLs are not linked correctly, you may see an error like + _"The application was unable to start correctly (0xc000007b)"_, which typically + indicates a mismatch between 32-bit and 64-bit libraries. + +!!! info "Limitations of using the MinGW build within Visual Studio" + + - Both static and dynamic linking are supported with MinGW. With Visual + Studio, however, only dynamic linking is supported and so you should use + the import library. + - Debugging from Visual Studio does not work because MinGW and Visual + Studio have incompatible formats for debug information (PDB vs. + DWARF/STABS). You should either debug with GDB on the command line or + with a visual frontend, for instance [Eclipse](http://www.eclipse.org/cdt/) or + [Qt Creator](http://qt.nokia.com/products/developer-tools/). + + +#### Windows on Arm + +The following tools needs to be installed to build for Windows on Arm (WoA): + +- Clang for Windows on Arm. + Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/). + E.g: LLVM 12 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.0/LLVM-12.0.0-woa64.exe) + Run the LLVM installer and ensure that LLVM is added to environment PATH. 
+- Download and install classic Flang for Windows on Arm. + Classic Flang is the only available Fortran compiler for Windows on Arm for now. + A pre-release build can be found [here](https://github.com/kaadam/flang/releases/tag/v0.1) + There is no installer for classic flang and the zip package can be + extracted and the path needs to be added to environment `PATH`. + E.g., in PowerShell: + ``` + $env:Path += ";C:\flang_woa\bin" + ``` + +The following steps describe how to build the static library for OpenBLAS with and without LAPACK: + +1. Build OpenBLAS static library with BLAS and LAPACK routines with Make: + + ```bash + $ make CC="clang-cl" HOSTCC="clang-cl" AR="llvm-ar" BUILD_WITHOUT_LAPACK=0 NOFORTRAN=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 USE_OPENMP=0 PARALLEL=1 RANLIB="llvm-ranlib" MAKE=make F_COMPILER=FLANG FC=FLANG FFLAGS_NOOPT="-march=armv8-a -cpp" FFLAGS="-march=armv8-a -cpp" NEED_PIC=0 HOSTARCH=arm64 libs netlib + ``` + +2. Build static library with BLAS routines using CMake: + + Classic Flang has compatibility issues with CMake, hence only BLAS routines can be compiled with CMake: + + ```bash + $ mkdir build + $ cd build + $ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang -DBUILD_WITHOUT_LAPACK=1 -DNOFORTRAN=1 -DDYNAMIC_ARCH=0 -DTARGET=ARMV8 -DARCH=arm64 -DBINARY=64 -DUSE_OPENMP=0 -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_CROSSCOMPILING=1 -DCMAKE_SYSTEM_NAME=Windows + $ cmake --build . --config Release + ``` + +!!! tip "`getarch.exe` execution error" + + If you notice that platform-specific headers by `getarch.exe` are not + generated correctly, this could be due to a known debug runtime DLL issue for + arm64 platforms. Please check out [this page](https://linaro.atlassian.net/wiki/spaces/WOAR/pages/28677636097/Debug+run-time+DLL+issue#Workaround) + for a workaround. + + +#### Generating an import library + +Microsoft Windows has this thing called "import libraries". 
You need it for +MSVC; you don't need it for MinGW because the `ld` linker is smart enough - +however, you may still want it for some reason, so we'll describe the process +for both MSVC and MinGW. + +Import libraries are compiled from a list of what symbols to use, which are +contained in a `.def` file. A `.def` file should be already be present in the +`exports` directory under the top-level OpenBLAS directory after you've run a build. +In your shell, move to this directory: `cd exports`. + +=== "MSVC" + + Unlike MinGW, MSVC absolutely requires an import library. Now the C ABI of + MSVC and MinGW are actually identical, so linking is actually okay (any + incompatibility in the C ABI would be a bug). + + The import libraries of MSVC have the suffix `.lib`. They are generated + from a `.def` file using MSVC's `lib.exe`. See [the MSVC instructions](use_visual_studio.md#generate-import-library-before-0210-version). + +=== "MinGW" + + MinGW import libraries have the suffix `.a`, just like static libraries. + Our goal is to produce the file `libopenblas.dll.a`. + + You need to first insert a line `LIBRARY libopenblas.dll` in `libopenblas.def`: + ``` + cat <(echo "LIBRARY libopenblas.dll") libopenblas.def > libopenblas.def.1 + mv libopenblas.def.1 libopenblas.def + ``` + + Now the `.def` file probably looks like: + ``` + LIBRARY libopenblas.dll + EXPORTS + caxpy=caxpy_ @1 + caxpy_=caxpy_ @2 + ... + ``` + Then, generate the import library: `dlltool -d libopenblas.def -l libopenblas.dll.a` + + _Again, there is basically **no point** in making an import library for use in MinGW. It actually slows down linking._ + + +### Android + +To build OpenBLAS for Android, you will need the following tools installed on your machine: + +- [The Android NDK](https://developer.android.com/ndk/) +- Perl +- Clang compiler on the build machine + +The next two sections below describe how to build with Clang for ARMV7 and +ARMV8 targets, respectively. 
The same basic principles as described below for +ARMV8 should also apply to building an x86 or x86-64 version (substitute +something like `NEHALEM` for the target instead of `ARMV8`, and replace all the +`aarch64` in the toolchain paths with `x86` or `x86_64` as appropriate). + +!!! info "Historic note" + + Since NDK version 19, the default toolchain is provided as a standalone + toolchain, so building one yourself following + [building a standalone toolchain](http://developer.android.com/ndk/guides/standalone_toolchain.html) + should no longer be necessary. + + +#### Building for ARMV7 + +```bash +# Set path to ndk-bundle +export NDK_BUNDLE_DIR=/path/to/ndk-bundle + +# Set the PATH to contain paths to clang and arm-linux-androideabi-* utilities +export PATH=${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin:${NDK_BUNDLE_DIR}/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH + +# Set LDFLAGS so that the linker finds the appropriate libgcc +export LDFLAGS="-L${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/lib/gcc/arm-linux-androideabi/4.9.x" + +# Set the clang cross compile flags +export CLANG_FLAGS="-target arm-linux-androideabi -marm -mfpu=vfp -mfloat-abi=softfp --sysroot ${NDK_BUNDLE_DIR}/platforms/android-23/arch-arm -gcc-toolchain ${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/" + +#OpenBLAS Compile +make TARGET=ARMV7 ONLY_CBLAS=1 AR=ar CC="clang ${CLANG_FLAGS}" HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 +``` + +On macOS, it may also be necessary to give the complete path to the `ar` +utility in the make command above, like so: +```bash +AR=${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-gcc-ar +``` +otherwise you may get a linker error complaining like `malformed archive header +name at 8` when the native macOS `ar` command was invoked instead. 
+ + +#### Building for ARMV8 + +```bash +# Set path to ndk-bundle +export NDK_BUNDLE_DIR=/path/to/ndk-bundle/ + +# Export PATH to contain directories of clang and aarch64-linux-android-* utilities +export PATH=${NDK_BUNDLE_DIR}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/:${NDK_BUNDLE_DIR}/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH + +# Setup LDFLAGS so that loader can find libgcc and pass -lm for sqrt +export LDFLAGS="-L${NDK_BUNDLE_DIR}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/lib/gcc/aarch64-linux-android/4.9.x -lm" + +# Setup the clang cross compile options +export CLANG_FLAGS="-target aarch64-linux-android --sysroot ${NDK_BUNDLE_DIR}/platforms/android-23/arch-arm64 -gcc-toolchain ${NDK_BUNDLE_DIR}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/" + +# Compile +make TARGET=ARMV8 ONLY_CBLAS=1 AR=ar CC="clang ${CLANG_FLAGS}" HOSTCC=gcc -j4 +``` +Note: using `TARGET=CORTEXA57` in place of `ARMV8` will pick up better +optimized routines. Implementations for the `CORTEXA57` target are compatible +with all other `ARMV8` targets. + +Note: for NDK 23b, something as simple as: +```bash +export PATH=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH +make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8 +``` +appears to be sufficient on Linux. + + +??? note "Alternative build script for 3 architectures" + + This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, `X86`) and install them to `/opt/OpenBLAS/lib`. + It was tested on macOS with NDK version 21.3.6528147. 
+ + ```bash + export NDK=YOUR_PATH_TO_SDK/Android/sdk/ndk/21.3.6528147 + export TOOLCHAIN=$NDK/toolchains/llvm/prebuilt/darwin-x86_64 + + make clean + make \ + TARGET=ARMV7 \ + ONLY_CBLAS=1 \ + CC="$TOOLCHAIN"/bin/armv7a-linux-androideabi21-clang \ + AR="$TOOLCHAIN"/bin/arm-linux-androideabi-ar \ + HOSTCC=gcc \ + ARM_SOFTFP_ABI=1 \ + -j4 + sudo make install + + make clean + make \ + TARGET=CORTEXA57 \ + ONLY_CBLAS=1 \ + CC=$TOOLCHAIN/bin/aarch64-linux-android21-clang \ + AR=$TOOLCHAIN/bin/aarch64-linux-android-ar \ + HOSTCC=gcc \ + -j4 + sudo make install + + make clean + make \ + TARGET=ATOM \ + ONLY_CBLAS=1 \ + CC="$TOOLCHAIN"/bin/i686-linux-android21-clang \ + AR="$TOOLCHAIN"/bin/i686-linux-android-ar \ + HOSTCC=gcc \ + ARM_SOFTFP_ABI=1 \ + -j4 + sudo make install + + ## This will build for x86_64 + make clean + make \ + TARGET=ATOM BINARY=64 \ + ONLY_CBLAS=1 \ + CC="$TOOLCHAIN"/bin/x86_64-linux-android21-clang \ + AR="$TOOLCHAIN"/bin/x86_64-linux-android-ar \ + HOSTCC=gcc \ + ARM_SOFTFP_ABI=1 \ + -j4 + sudo make install + ``` + You can find the full list of target architectures in [TargetList.txt](https://github.com/OpenMathLib/OpenBLAS/blob/develop/TargetList.txt) + + +### iPhone/iOS + +As none of the current developers uses iOS, the following instructions are what +was found to work in our Azure CI setup, but as far as we know this builds a +fully working OpenBLAS for this platform. + +Go to the directory where you unpacked OpenBLAS, and enter the following commands: +```bash +CC=/Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + +CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0" + +make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 +``` +Adjust the `-miphoneos-version-min` value as necessary for your installation. 
E.g., change the version number +to the minimum iOS version you want to target and execute this file to build the library. + + +### MIPS + +For MIPS targets you will need latest toolchains: + +- P5600 - MTI GNU/Linux Toolchain +- I6400, P6600 - IMG GNU/Linux Toolchain + +You can use following commandlines for builds: + +```bash +IMG_TOOLCHAIN_DIR={full IMG GNU/Linux Toolchain path including "bin" directory -- for example, /opt/linux_toolchain/bin} +IMG_GCC_PREFIX=mips-img-linux-gnu +IMG_TOOLCHAIN=${IMG_TOOLCHAIN_DIR}/${IMG_GCC_PREFIX} + +# I6400 Build (n32): +make BINARY=32 BINARY32=1 CC=$IMG_TOOLCHAIN-gcc AR=$IMG_TOOLCHAIN-ar FC="$IMG_TOOLCHAIN-gfortran -EL -mabi=n32" RANLIB=$IMG_TOOLCHAIN-ranlib HOSTCC=gcc CFLAGS="-EL" FFLAGS=$CFLAGS LDFLAGS=$CFLAGS TARGET=I6400 + +# I6400 Build (n64): +make BINARY=64 BINARY64=1 CC=$IMG_TOOLCHAIN-gcc AR=$IMG_TOOLCHAIN-ar FC="$IMG_TOOLCHAIN-gfortran -EL" RANLIB=$IMG_TOOLCHAIN-ranlib HOSTCC=gcc CFLAGS="-EL" FFLAGS=$CFLAGS LDFLAGS=$CFLAGS TARGET=I6400 + +# P6600 Build (n32): +make BINARY=32 BINARY32=1 CC=$IMG_TOOLCHAIN-gcc AR=$IMG_TOOLCHAIN-ar FC="$IMG_TOOLCHAIN-gfortran -EL -mabi=n32" RANLIB=$IMG_TOOLCHAIN-ranlib HOSTCC=gcc CFLAGS="-EL" FFLAGS=$CFLAGS LDFLAGS=$CFLAGS TARGET=P6600 + +# P6600 Build (n64): +make BINARY=64 BINARY64=1 CC=$IMG_TOOLCHAIN-gcc AR=$IMG_TOOLCHAIN-ar FC="$IMG_TOOLCHAIN-gfortran -EL" RANLIB=$IMG_TOOLCHAIN-ranlib HOSTCC=gcc CFLAGS="-EL" FFLAGS="$CFLAGS" LDFLAGS="$CFLAGS" TARGET=P6600 + +MTI_TOOLCHAIN_DIR={full MTI GNU/Linux Toolchain path including "bin" directory -- for example, /opt/linux_toolchain/bin} +MTI_GCC_PREFIX=mips-mti-linux-gnu +MTI_TOOLCHAIN=${IMG_TOOLCHAIN_DIR}/${IMG_GCC_PREFIX} + +# P5600 Build: + +make BINARY=32 BINARY32=1 CC=$MTI_TOOLCHAIN-gcc AR=$MTI_TOOLCHAIN-ar FC="$MTI_TOOLCHAIN-gfortran -EL" RANLIB=$MTI_TOOLCHAIN-ranlib HOSTCC=gcc CFLAGS="-EL" FFLAGS=$CFLAGS LDFLAGS=$CFLAGS TARGET=P5600 +``` + + +### FreeBSD + +You will need to install the following tools from the FreeBSD ports tree: + +* 
lang/gcc +* lang/perl5.12 +* ftp/curl +* devel/gmake +* devel/patch + +To compile run the command: +```bash +$ gmake CC=gcc FC=gfortran +``` + + +### Cortex-M + +Cortex-M is a widely used microcontroller that is present in a variety of +industrial and consumer electronics. A common variant of the Cortex-M is the +`STM32F4xx` series. Here, we will give instructions for building for that +series. + +First, install the embedded Arm GCC compiler from the Arm website. Then, create +the following `toolchain.cmake` file: + +```cmake +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_SYSTEM_PROCESSOR arm) + +set(CMAKE_C_COMPILER "arm-none-eabi-gcc.exe") +set(CMAKE_CXX_COMPILER "arm-none-eabi-g++.exe") + +set(CMAKE_EXE_LINKER_FLAGS "--specs=nosys.specs" CACHE INTERNAL "") + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) +``` + +Then build OpenBLAS with: +```bash +$ cmake .. -G Ninja -DCMAKE_C_COMPILER=arm-none-eabi-gcc -DCMAKE_TOOLCHAIN_FILE:PATH="toolchain.cmake" -DNOFORTRAN=1 -DTARGET=ARMV5 -DEMBEDDED=1 +``` + +In your embedded application, the following functions need to be provided for OpenBLAS to work correctly: +```C +void free(void* ptr); +void* malloc(size_t size); +``` + +!!! note + + If you are developing for an embedded platform, it is your responsibility + to make sure that the device has sufficient memory for `malloc` calls. + [Libmemory](https://github.com/embeddedartistry/libmemory) + provides one implementation of `malloc` for embedded platforms. 
diff --git a/docs/logo.svg b/docs/logo.svg new file mode 100644 index 00000000..e5a5df29 --- /dev/null +++ b/docs/logo.svg @@ -0,0 +1,450 @@ + + + + + + image/svg+xml + + OpenBLAS logo + + + + OpenBLAS logo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | + + diff --git a/docs/user_manual.md b/docs/user_manual.md new file mode 100644 index 00000000..7abdcf0b --- /dev/null +++ b/docs/user_manual.md @@ -0,0 +1,305 @@ + +This user manual covers compiling OpenBLAS itself, linking your code to OpenBLAS, +example code to use the C (CBLAS) and Fortran (BLAS) APIs, and some troubleshooting +tips. Compiling OpenBLAS is optional, since you may be able to install with a +package manager. + +!!! Note BLAS API reference documentation + + The OpenBLAS documentation does not contain API reference documentation for + BLAS or LAPACK, since these are standardized APIs, the documentation for + which can be found in other places. If you want to understand every BLAS + and LAPACK function and definition, we recommend reading the + [Netlib BLAS ](http://netlib.org/blas/) and [Netlib LAPACK](http://netlib.org/lapack/) + documentation. + + OpenBLAS does contain a limited number of functions that are non-standard, + these are documented at [OpenBLAS extension functions](extensions.md). + + +## Compiling OpenBLAS + +### Normal compile + +The default way to build and install OpenBLAS from source is with Make: +``` +make # add `-j4` to compile in parallel with 4 processes +make install +``` + +By default, the CPU architecture is detected automatically when invoking +`make`, and the build is optimized for the detected CPU. To override the +autodetection, use the `TARGET` flag: + +``` +# `make TARGET=xxx` sets target CPU: e.g. 
for an Intel Nehalem CPU: +make TARGET=NEHALEM +``` +The full list of known target CPU architectures can be found in +`TargetList.txt` in the root of the repository. + +### Cross compile + +For a basic cross-compilation with Make, three steps need to be taken: + +- Set the `CC` and `FC` environment variables to select the cross toolchains + for C and Fortran. +- Set the `HOSTCC` environment variable to select the host C compiler (i.e. the + regular C compiler for the machine on which you are invoking the build). +- Set `TARGET` explicitly to the CPU architecture on which the produced + OpenBLAS binaries will be used. + +#### Cross-compilation examples + +Compile the library for ARM Cortex-A9 linux on an x86-64 machine +_(note: install only `gnueabihf` versions of the cross toolchain - see +[this issue comment](https://github.com/OpenMathLib/OpenBLAS/issues/936#issuecomment-237596847) +for why_): +``` +make CC=arm-linux-gnueabihf-gcc FC=arm-linux-gnueabihf-gfortran HOSTCC=gcc TARGET=CORTEXA9 +``` + +Compile OpenBLAS for a loongson3a CPU on an x86-64 machine: +``` +make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A +``` + +Compile OpenBLAS for loongson3a CPU with the `loongcc` (based on Open64) compiler on an x86-64 machine: +``` +make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 +``` + +### Building a debug version + +Add `DEBUG=1` to your build command, e.g.: +``` +make DEBUG=1 +``` + +### Install to a specific directory + +!!! note + + Installing to a directory is optional; it is also possible to use the shared or static + libraries directly from the build directory. + +Use `make install` with the `PREFIX` flag to install to a specific directory: + +``` +make install PREFIX=/path/to/installation/directory +``` + +The default directory is `/opt/OpenBLAS`. + +!!! 
important + + Note that any flags passed to `make` during build should also be passed to + `make install` to circumvent any install errors, i.e. some headers not + being copied over correctly. + +For more detailed information on building/installing from source, please read +the [Installation Guide](install.md). + + +## Linking to OpenBLAS + +OpenBLAS can be used as a shared or a static library. + +### Link a shared library + +The shared library is normally called `libopenblas.so`, but not that the name +may be different as a result of build flags used or naming choices by a distro +packager (see [distributing.md] for details). To link a shared library named +`libopenblas.so`, the flag `-lopenblas` is needed. To find the OpenBLAS headers, +a `-I/path/to/includedir` is needed. And unless the library is installed in a +directory that the linker searches by default, also `-L` and `-Wl,-rpath` flags +are needed. For a source file `test.c` (e.g., the example code under _Call +CBLAS interface_ further down), the shared library can then be linked with: +``` +gcc -o test test.c -I/your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -Wl,-rpath,/your_path/OpenBLAS/lib -lopenblas +``` + +The `-Wl,-rpath,/your_path/OpenBLAS/lib` linker flag can be omitted if you +ran `ldconfig` to update linker cache, put `/your_path/OpenBLAS/lib` in +`/etc/ld.so.conf` or a file in `/etc/ld.so.conf.d`, or installed OpenBLAS in a +location that is part of the `ld.so` default search path (usually `/lib`, +`/usr/lib` and `/usr/local/lib`). Alternatively, you can set the environment +variable `LD_LIBRARY_PATH` to point to the folder that contains `libopenblas.so`. +Otherwise, the build may succeed but at runtime loading the library will fail +with a message like: +``` +cannot open shared object file: no such file or directory +``` + +More flags may be needed, depending on how OpenBLAS was built: + +- If `libopenblas` is multi-threaded, please add `-lpthread`. 
+- If the library contains LAPACK functions (usually also true), please add + `-lgfortran` (other Fortran libraries may also be needed, e.g. `-lquadmath`). + Note that if you only make calls to LAPACKE routines, i.e. your code has + `#include "lapacke.h"` and makes calls to methods like `LAPACKE_dgeqrf`, + then `-lgfortran` is not needed. + +!!! tip Use pkg-config + + Usually a pkg-config file (e.g., `openblas.pc`) is installed together + with a `libopenblas` shared library. pkg-config is a tool that will + tell you the exact flags needed for linking. For example: + + ``` + $ pkg-config --cflags openblas + -I/usr/local/include + $ pkg-config --libs openblas + -L/usr/local/lib -lopenblas + ``` + +### Link a static library + +Linking a static library is simpler - add the path to the static OpenBLAS +library to the compile command: +``` +gcc -o test test.c /your/path/libopenblas.a +``` + + +## Code examples + +### Call CBLAS interface + +This example shows calling `cblas_dgemm` in C: + + +```c +#include +#include + +void main() +{ + int i=0; + double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; + double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; + double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5}; + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3); + + for(i=0; i<9; i++) + printf("%lf ", C[i]); + printf("\n"); +} +``` + +To compile this file, save it as `test_cblas_dgemm.c` and then run: +``` +gcc -o test_cblas_open test_cblas_dgemm.c -I/your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran +``` +will result in a `test_cblas_open` executable. 
+ +### Call BLAS Fortran interface + +This example shows calling the `dgemm` Fortran interface in C: + + +```c +#include "stdio.h" +#include "stdlib.h" +#include "sys/time.h" +#include "time.h" + +extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*); + +int main(int argc, char* argv[]) +{ + int i; + printf("test!\n"); + if(argc<4){ + printf("Input Error\n"); + return 1; + } + + int m = atoi(argv[1]); + int n = atoi(argv[2]); + int k = atoi(argv[3]); + int sizeofa = m * k; + int sizeofb = k * n; + int sizeofc = m * n; + char ta = 'N'; + char tb = 'N'; + double alpha = 1.2; + double beta = 0.001; + + struct timeval start,finish; + double duration; + + double* A = (double*)malloc(sizeof(double) * sizeofa); + double* B = (double*)malloc(sizeof(double) * sizeofb); + double* C = (double*)malloc(sizeof(double) * sizeofc); + + srand((unsigned)time(NULL)); + + for (i=0; i `, with `m`, `n`, and `k` input +parameters to the `time_dgemm` executable. + +!!! note + + When calling the Fortran interface from C, you have to deal with symbol name + differences caused by compiler conventions. That is why the `dgemm_` function + call in the example above has a trailing underscore. This is what it looks like + when using `gcc`/`gfortran`, however such details may change for different + compilers. Hence it requires extra support code. The CBLAS interface may be + more portable when writing C code. + + When writing code that needs to be portable and work across different + platforms and compilers, the above code example is not recommended for + usage. Instead, we advise looking at how OpenBLAS (or BLAS in general, since + this problem isn't specific to OpenBLAS) functions are called in widely + used projects like Julia, SciPy, or R. + + +## Troubleshooting + +* Please read the [FAQ](faq.md) first, your problem may be described there. 
+* Please ensure you are using a recent enough compiler, that supports the + features your CPU provides (example: GCC versions before 4.6 were known to + not support AVX kernels, and before 6.1 AVX512CD kernels). +* The number of CPU cores supported by default is <=256. On Linux x86-64, there + is experimental support for up to 1024 cores and 128 NUMA nodes if you build + the library with `BIGNUMA=1`. +* OpenBLAS does not set processor affinity by default. On Linux, you can enable + processor affinity by commenting out the line `NO_AFFINITY=1` in + `Makefile.rule`. +* On Loongson 3A, `make test` is known to fail with a `pthread_create` error + and an `EAGAIN` error code. However, it will be OK when you run the same + testcase in a shell. diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 75b25d03..b1ec94c2 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -68,6 +68,8 @@ if (USE_THREAD) endif () foreach (float_type ${FLOAT_TYPES}) + GenerateNamedObjects("gemm_batch_thread.c" "" "gemm_batch_thread" 0 "" "" false ${float_type}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateCombinationObjects("zherk_kernel.c" "LOWER;CONJ" "U;N" "HERK" 2 "herk_kernel" false ${float_type}) # TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination diff --git a/driver/level3/Makefile b/driver/level3/Makefile index b8465d4e..c3048384 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -37,7 +37,7 @@ SBLASOBJS += \ ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \ ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) \ - ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) + ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) sgemm_batch_thread.$(SUFFIX) DBLASOBJS += \ dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ @@ 
-53,7 +53,7 @@ DBLASOBJS += \ dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \ dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \ dsyrk_kernel_U.$(SUFFIX) dsyrk_kernel_L.$(SUFFIX) \ - dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) + dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) dgemm_batch_thread.$(SUFFIX) QBLASOBJS += \ qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \ @@ -103,7 +103,7 @@ CBLASOBJS += \ cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ csyr2k_kernel_U.$(SUFFIX) csyr2k_kernel_L.$(SUFFIX) \ cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \ - cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) + cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) cgemm_batch_thread.$(SUFFIX) ZBLASOBJS += \ zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \ @@ -137,7 +137,7 @@ ZBLASOBJS += \ zherk_kernel_LN.$(SUFFIX) zherk_kernel_LC.$(SUFFIX) \ zsyr2k_kernel_U.$(SUFFIX) zsyr2k_kernel_L.$(SUFFIX) \ zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \ - zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) + zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) zgemm_batch_thread.$(SUFFIX) XBLASOBJS += \ @@ -2942,6 +2942,21 @@ gemm_thread_variable.$(PSUFFIX) : gemm_thread_variable.c ../../common.h beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) +sbgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + sbgemm_thread_nn.$(PSUFFIX) : gemm.c 
level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) diff --git a/driver/level3/gemm_batch_thread.c b/driver/level3/gemm_batch_thread.c new file mode 100644 index 00000000..45d6977b --- /dev/null +++ b/driver/level3/gemm_batch_thread.c @@ -0,0 +1,156 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include "common.h" + +void openblas_warning(int verbose, const char * msg); + +#ifdef SMALL_MATRIX_OPT +static int inner_small_matrix_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ + int routine_mode; +#ifndef COMPLEX + int (*gemm_small_kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG); + int (*gemm_small_kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG); +#else + int (*zgemm_small_kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG); + int (*zgemm_small_kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG); + FLOAT alpha[2], beta[2]; +#endif + routine_mode=args->routine_mode; + if((routine_mode & BLAS_SMALL_B0_OPT) == BLAS_SMALL_B0_OPT){ +#ifndef COMPLEX + gemm_small_kernel_b0=args->routine; + gemm_small_kernel_b0(args->m, args->n, args->k, args->a, args->lda, *(FLOAT *)(args->alpha), args->b, args->ldb, args->c, args->ldc); +#else + zgemm_small_kernel_b0=args->routine; + alpha[0] = *((FLOAT *)args -> alpha + 0); + alpha[1] = *((FLOAT *)args -> alpha + 1); + zgemm_small_kernel_b0(args->m, args->n, args->k, args->a, args->lda, alpha[0], alpha[1], args->b, args->ldb, args->c, args->ldc); +#endif + return(0); + }else if(routine_mode & BLAS_SMALL_OPT){ +#ifndef COMPLEX + gemm_small_kernel=args->routine; + gemm_small_kernel(args->m, args->n, args->k, args->a, args->lda, *(FLOAT *)(args->alpha), args->b, args->ldb, *(FLOAT *)(args->beta), args->c, args->ldc); +#else + zgemm_small_kernel=args->routine; + alpha[0] = *((FLOAT *)args -> alpha + 0); + alpha[1] = *((FLOAT *)args -> alpha + 1); + beta[0] = *((FLOAT *)args -> beta + 0); + beta[1] = *((FLOAT *)args -> beta + 1); + 
zgemm_small_kernel(args->m, args->n, args->k, args->a, args->lda, alpha[0], alpha[1], args->b, args->ldb, beta[0], beta[1], args->c, args->ldc); +#endif + return(0); + } + return(1); +} +#endif + +int CNAME(blas_arg_t * args_array, BLASLONG nums){ + XFLOAT *buffer; + XFLOAT *sa, *sb; + int nthreads=1; + int (*routine)(blas_arg_t *, void *, void *, XFLOAT *, XFLOAT *, BLASLONG); + int i=0, /*j,*/ current_nums; + +#ifdef SMP + blas_queue_t * queue=NULL; +#endif + + if(nums <=0 ) return 0; + + buffer = (XFLOAT *)blas_memory_alloc(0); + sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); + sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + nthreads=num_cpu_avail(3); + + if(nthreads==1){ + +#endif + //single thread + for(i=0; inthreads)? nthreads: (nums-i); + + queue[i].sa=sa; + queue[i].sb=sb; + queue[i+current_nums-1].next=NULL; + + exec_blas(current_nums, &queue[i]); + } + free(queue); + } +#endif + blas_memory_free(buffer); + return 0; +} diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index c7ccf842..ddb39abd 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -570,6 +570,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); #else static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; + static pthread_cond_t level3_wakeup = PTHREAD_COND_INITIALIZER; + volatile static BLASLONG CPU_AVAILABLE = MAX_CPU_NUMBER; #endif blas_arg_t newarg; @@ -639,6 +641,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); #else pthread_mutex_lock(&level3_lock); + while(CPU_AVAILABLE < nthreads) { + pthread_cond_wait(&level3_wakeup, &level3_lock); + } + CPU_AVAILABLE -= nthreads; + WMB; + pthread_mutex_unlock(&level3_lock); #endif #ifdef USE_ALLOC_HEAP @@ -783,6 +791,10 @@ static int 
gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #elif defined(OS_WINDOWS) LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); #else + pthread_mutex_lock(&level3_lock); + CPU_AVAILABLE += nthreads; + WMB; + pthread_cond_signal(&level3_wakeup); pthread_mutex_unlock(&level3_lock); #endif @@ -826,6 +838,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF if (nthreads_m * nthreads_n > args -> nthreads) { nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); } + /* The nthreads_m and nthreads_n are adjusted so that the submatrix */ + /* to be handled by each thread preferably becomes a square matrix */ + /* by minimizing an objective function 'n * nthreads_m + m * nthreads_n'. */ + /* Objective function come from sum of partitions in m and n. */ + /* (n / nthreads_n) + (m / nthreads_m) */ + /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ + while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { + nthreads_m /= 2; + nthreads_n *= 2; + } } /* Execute serial or parallel computation */ diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 1a38740a..659449fb 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -25,6 +25,7 @@ if (USE_THREAD) ${BLAS_SERVER} divtable.c # TODO: Makefile has -UDOUBLE blas_l1_thread.c + blas_server_callback.c ) if (NOT NO_AFFINITY) @@ -51,6 +52,8 @@ if (DYNAMIC_ARCH) list(APPEND COMMON_SOURCES dynamic_arm64.c) elseif (POWER) list(APPEND COMMON_SOURCES dynamic_power.c) + elseif (RISCV64) + list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () diff --git a/driver/others/Makefile b/driver/others/Makefile index e4e9ee10..719d617c 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -6,7 +6,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) #COMMONOBJS += 
slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) blas_server_callback.$(SUFFIX) ifneq ($(NO_AFFINITY), 1) COMMONOBJS += init.$(SUFFIX) endif @@ -30,12 +30,16 @@ else ifeq ($(ARCH),loongarch64) COMMONOBJS += dynamic_loongarch64.$(SUFFIX) else +ifeq ($(ARCH),riscv64) +COMMONOBJS += dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif endif endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -106,12 +110,16 @@ else ifeq ($(ARCH),loongarch64) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX) else +ifeq ($(ARCH),riscv64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif @@ -140,6 +148,9 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) +blas_server_callback.$(SUFFIX) : blas_server_callback.c ../../common.h + $(CC) $(CFLAGS) -c $< -o $(@F) + openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -206,6 +217,9 @@ addx.$(SUFFIX) : $(ARCH)/addx.c mulx.$(SUFFIX) : $(ARCH)/mulx.c $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) +detect_riscv64.$(SUFFIX): detect_riscv64.c + $(CC) $(CFLAGS) -c -march=rv64imafdcv $< -o $(@F) + xerbla.$(PSUFFIX) : xerbla.c $(CC) $(PFLAGS) -c $< -o $(@F) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b1582e24..b0409f09 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -115,6 +115,8 @@ int blas_server_avail 
__attribute__((aligned(ATTRIBUTE_SIZE))) = 0; int blas_omp_threads_local = 1; +static void * blas_thread_buffer[MAX_CPU_NUMBER]; + /* Local Variables */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; @@ -190,6 +192,10 @@ static int main_status[MAX_CPU_NUMBER]; BLASLONG exit_time[MAX_CPU_NUMBER]; #endif +//Prototypes +static void exec_threads(int , blas_queue_t *, int); +static void adjust_thread_buffers(); + static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ @@ -375,7 +381,6 @@ static void* blas_thread_server(void *arg){ /* Thread identifier */ BLASLONG cpu = (BLASLONG)arg; unsigned int last_tick; - void *buffer, *sa, *sb; blas_queue_t *queue; blas_queue_t *tscq; @@ -395,8 +400,6 @@ blas_queue_t *tscq; main_status[cpu] = MAIN_ENTER; #endif - buffer = blas_memory_alloc(2); - #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu); #endif @@ -415,7 +418,7 @@ blas_queue_t *tscq; tscq = atomic_load_queue(&thread_status[cpu].queue); - while(!tscq) { + while(!tscq || tscq == 0x1) { YIELDING; if ((unsigned int)rpcc() - last_tick > thread_timeout) { @@ -456,109 +459,9 @@ blas_queue_t *tscq; start = rpcc(); #endif - if (queue) { - int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; - - atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); - - sa = queue -> sa; - sb = queue -> sb; - -#ifdef SMP_DEBUG - if (queue -> args) { - fprintf(STDERR, "Server[%2ld] Calculation started. 
Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", - cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); - } -#endif - -#ifdef CONSISTENT_FPCSR -#ifdef __aarch64__ - __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); -#else - __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); - __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); -#endif -#endif - -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING1; -#endif - - if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); - - if (sb == NULL) { - if (!(queue -> mode & BLAS_COMPLEX)){ -#ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ - sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else -#endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { -#ifdef BUILD_DOUBLE - sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { -#ifdef BUILD_SINGLE - sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else { - /* Other types in future */ - } - } else { -#ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ - sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else -#endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ -#ifdef BUILD_COMPLEX16 - sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { -#ifdef BUILD_COMPLEX - sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else { - /* Other types in future */ - } - } - queue->sb=sb; - } - -#ifdef MONITOR - main_status[cpu] = 
MAIN_RUNNING2; -#endif - - if (queue -> mode & BLAS_LEGACY) { - legacy_exec(routine, queue -> mode, queue -> args, sb); - } else - if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; - (pthreadcompat)(queue -> args); - } else - (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); - -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu); -#endif - -#ifdef MONITOR - main_status[cpu] = MAIN_FINISH; -#endif - - // arm: make sure all results are written out _before_ - // thread is marked as done and other threads use them - MB; - atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0); - - - } + if(queue) { + exec_threads(cpu, queue, 0); + } #ifdef MONITOR main_status[cpu] = MAIN_DONE; @@ -580,8 +483,6 @@ blas_queue_t *tscq; fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); #endif - blas_memory_free(buffer); - //pthread_exit(NULL); return NULL; @@ -663,6 +564,9 @@ int blas_thread_init(void){ LOCK_COMMAND(&server_lock); + // Adjust thread buffers + adjust_thread_buffers(); + if (!blas_server_avail){ thread_timeout_env=openblas_thread_timeout(); @@ -691,6 +595,8 @@ int blas_thread_init(void){ struct rlimit rlim; const char *msg = strerror(ret); fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: ensure that your address space and process count limits are big enough (ulimit -a)\n"); + fprintf(STDERR, "OpenBLAS blas_thread_init: or set a smaller OPENBLAS_NUM_THREADS to fit into what you have available\n"); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " @@ -893,6 +799,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "Exec_blas is called. 
Number of executing threads : %ld\n", num); #endif +//Redirect to caller's callback routine +if (openblas_threads_callback_) { + int buf_index = 0, i = 0; +#ifndef USE_SIMPLE_THREADED_LEVEL3 + for (i = 0; i < num; i ++) + queue[i].position = i; +#endif + openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index); + return 0; + } + + if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); #ifdef TIMING_DEBUG @@ -996,7 +914,7 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; -#if defined(ARCH_MIPS64) +#if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64) #ifndef DYNAMIC_ARCH //set parameters for different number of threads. blas_set_parameter(); @@ -1056,6 +974,14 @@ int BLASFUNC(blas_thread_shutdown)(void){ LOCK_COMMAND(&server_lock); + //Free buffers allocated for threads + for(i=0; i routine; + + atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); + + void *buffer = blas_thread_buffer[cpu]; + void *sa = queue -> sa; + void *sb = queue -> sb; + +#ifdef SMP_DEBUG + if (queue -> args) { +fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); + } +#endif + +#ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; +#endif + +//For target LOONGSON3R5, applying an offset to the buffer is essential +//for minimizing cache conflicts and optimizing performance. 
+#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) + if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); +#endif + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { +if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { +#ifdef BUILD_DOUBLE + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else { + /* Other types in future */ + } +} else { +#ifdef EXPRECISION + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else { + /* Other types in future */ + } +} +queue->sb=sb; + } + +#ifdef MONITOR +main_status[cpu] = MAIN_RUNNING2; +#endif + + if (queue -> mode & BLAS_LEGACY) { +legacy_exec(routine, queue -> mode, queue -> args, sb); + } else +if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; + (pthreadcompat)(queue -> args); +} else + (routine)(queue -> args, queue -> 
range_m, queue -> range_n, sa, sb, queue -> position); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu); +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_FINISH; #endif + // arm: make sure all results are written out _before_ + // thread is marked as done and other threads use them + MB; + atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0); + +} + +#endif diff --git a/driver/others/blas_server_callback.c b/driver/others/blas_server_callback.c new file mode 100644 index 00000000..48cf3541 --- /dev/null +++ b/driver/others/blas_server_callback.c @@ -0,0 +1,12 @@ +#include "common.h" + +/* global variable to change threading backend from openblas-managed to caller-managed */ +openblas_threads_callback openblas_threads_callback_ = 0; + +/* non-threadsafe function should be called before any other + openblas function to change how threads are managed */ + +void openblas_set_threads_callback_function(openblas_threads_callback callback) +{ + openblas_threads_callback_ = callback; +} \ No newline at end of file diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 6f2ea862..06862cec 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -113,7 +113,7 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; adjust_thread_buffers(); -#if defined(ARCH_MIPS64) +#if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64) //set parameters for different number of threads. 
blas_set_parameter(); #endif @@ -285,7 +285,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -static void exec_threads(blas_queue_t *queue, int buf_index){ +static void exec_threads(int thread_num, blas_queue_t *queue, int buf_index){ void *buffer, *sa, *sb; int pos=0, release_flag=0; @@ -305,7 +305,7 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { - pos = omp_get_thread_num(); + pos= thread_num; buffer = blas_thread_buffer[buf_index][pos]; //fallback @@ -420,18 +420,25 @@ while (true) { break; } } - if (i != MAX_PARALLEL_NUMBER) - break; -} -if (openblas_omp_adaptive_env() != 0) { -#pragma omp parallel for num_threads(num) schedule(OMP_SCHED) - for (i = 0; i < num; i ++) { + if(i != MAX_PARALLEL_NUMBER) + break; + } + /*For caller-managed threading, if caller has registered the callback, pass exec_thread as callback function*/ + if (openblas_threads_callback_) { +#ifndef USE_SIMPLE_THREADED_LEVEL3 + for (i = 0; i < num; i ++) + queue[i].position = i; +#endif + openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index); + } else { + if (openblas_omp_adaptive_env() != 0) { + #pragma omp parallel for num_threads(num) schedule(OMP_SCHED) + for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 queue[i].position = i; #endif - - exec_threads(&queue[i], buf_index); + exec_threads(omp_get_thread_num(), &queue[i], buf_index); } } else { #pragma omp parallel for schedule(OMP_SCHED) @@ -441,9 +448,10 @@ if (openblas_omp_adaptive_env() != 0) { queue[i].position = i; #endif - exec_threads(&queue[i], buf_index); + exec_threads(omp_get_thread_num(), &queue[i], buf_index); } } +} #ifdef HAVE_C11 atomic_store(&blas_buffer_inuse[buf_index], false); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 788a23b0..5d792eaa 100644 --- 
a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -1,612 +1,589 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#include -#include -#include "common.h" - -#if !defined(unlikely) -#ifdef __GNUC__ -#define unlikely(x) __builtin_expect(!!(x), 0) -#else -#define unlikely(x) (x) -#endif -#endif - -#ifdef SMP_DEBUG -# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) -#else -# define MT_TRACE(...) -#endif - -/* This is a thread implementation for Win32 lazy implementation */ - -/* Thread server common information */ - -static blas_queue_t *work_queue = NULL; -static HANDLE kickoff_event = NULL; -static CRITICAL_SECTION queue_lock; - -/* We need this global for checking if initialization is finished. */ -int blas_server_avail = 0; - -int blas_omp_threads_local = 1; - -/* Local Variables */ -static BLASULONG server_lock = 0; - -static HANDLE blas_threads [MAX_CPU_NUMBER]; -static DWORD blas_threads_id[MAX_CPU_NUMBER]; -static volatile int thread_target; // target num of live threads, volatile for cross-thread reads - -// -// Legacy code path -// -static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { - - if (!(mode & BLAS_COMPLEX)) { -#ifdef EXPRECISION - if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ - /* REAL / Extended Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, - xdouble *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((xdouble *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); - } else -#endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE) { - /* REAL / Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, - double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((double *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { - /* REAL / Single */ - void (*afunc)(BLASLONG, 
BLASLONG, BLASLONG, float, - float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((float *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); -#ifdef BUILD_BFLOAT16 - } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { - /* REAL / BFLOAT16 */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, - bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, - bfloat16 *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((bfloat16 *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { - /* REAL / BLAS_STOBF16 */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, - float *, BLASLONG, bfloat16 *, BLASLONG, - float *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((float *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { - /* REAL / BLAS_DTOBF16 */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, - double *, BLASLONG, bfloat16 *, BLASLONG, - double *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((double *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); -#endif - } else { - /* REAL / Other types in future */ - } - } else { -#ifdef EXPRECISION - if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { - /* COMPLEX / Extended Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, - xdouble *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((xdouble *)args -> alpha)[0], - ((xdouble *)args -> alpha)[1], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); - } else -#endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE) { - /* COMPLEX / Double */ - 
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, - double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((double *)args -> alpha)[0], - ((double *)args -> alpha)[1], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { - /* COMPLEX / Single */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, - float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((float *)args -> alpha)[0], - ((float *)args -> alpha)[1], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); - } else { - /* COMPLEX / Other types in future */ - } - } -} - -// -// This is a main routine of threads. Each thread waits until job is queued. -// -static DWORD WINAPI blas_thread_server(void *arg) { - - /* Thread identifier */ - BLASLONG cpu = (BLASLONG)arg; - - void *buffer, *sa, *sb; - blas_queue_t *queue; - - /* Each server needs each buffer */ - buffer = blas_memory_alloc(2); - - MT_TRACE("Server[%2ld] Thread is started!\n", cpu); - - while (1) { - - /* Waiting for Queue */ - - MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); - - // event raised when work is added to the queue - WaitForSingleObject(kickoff_event, INFINITE); - - if (cpu > thread_target - 2) { - //MT_TRACE("thread [%d] exiting.\n", cpu); - break; // excess thread, so worker thread exits - } - - MT_TRACE("Server[%2ld] Got it.\n", cpu); - - EnterCriticalSection(&queue_lock); - - queue = work_queue; - if (queue) - work_queue = work_queue->next; - - LeaveCriticalSection(&queue_lock); - - if (queue) { - int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - - sa = queue -> sa; - sb = queue -> sb; - - #ifdef CONSISTENT_FPCSR - __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); - __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> 
x87_mode)); - #endif - - MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", - cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); - - // fprintf(stderr, "queue start[%ld]!!!\n", cpu); - - #ifdef MONITOR - main_status[cpu] = MAIN_RUNNING1; - #endif - - if (sa == NULL) - sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); - - if (sb == NULL) { - if (!(queue -> mode & BLAS_COMPLEX)) { -#ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { - sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else -#endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { -#ifdef BUILD_DOUBLE - sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { -#ifdef BUILD_SINGLE - sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else { - /* Other types in future */ - } - } else { -#ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ - sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else -#endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ -#ifdef BUILD_COMPLEX16 - sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { -#ifdef BUILD_COMPLEX - sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) - + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); -#endif - } else { - /* Other types in future */ - } - } - queue->sb=sb; - } - - #ifdef MONITOR - main_status[cpu] = MAIN_RUNNING2; - #endif - - if (!(queue -> mode & BLAS_LEGACY)) { - (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); - } else { 
- legacy_exec(routine, queue -> mode, queue -> args, sb); - } - } else { - continue; //if queue == NULL - } - - MT_TRACE("Server[%2ld] Finished!\n", cpu); - - queue->finished = 1; - } - - /* Shutdown procedure */ - - MT_TRACE("Server[%2ld] Shutdown!\n", cpu); - - blas_memory_free(buffer); - - return 0; -} - -// -// Initializing routine -// -int blas_thread_init(void) { - BLASLONG i; - - if (blas_server_avail || (blas_cpu_number <= 1)) return 0; - - LOCK_COMMAND(&server_lock); - - MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); - - if (!blas_server_avail) { - // create the kickoff Event - kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - - thread_target = blas_cpu_number; - - InitializeCriticalSection(&queue_lock); - - for(i = 0; i < blas_cpu_number - 1; i++) { - //MT_TRACE("thread_init: creating thread [%d]\n", i); - - blas_threads[i] = CreateThread(NULL, 0, - blas_thread_server, (void *)i, - 0, &blas_threads_id[i]); - } - - blas_server_avail = 1; - } - - UNLOCK_COMMAND(&server_lock); - - return 0; -} - -// -// User can call one of two routines. -// exec_blas_async ... immediately returns after jobs are queued. -// exec_blas ... returns after jobs are finished. 
-// -int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { - -#if defined(SMP_SERVER) - // Handle lazy re-init of the thread-pool after a POSIX fork - // on Cygwin or as delayed init when a static library is used - if (unlikely(blas_server_avail == 0)) blas_thread_init(); -#endif - - blas_queue_t *current; - - current = queue; - - while (current) { - current -> position = pos; - -#ifdef CONSISTENT_FPCSR - __asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode)); - __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); -#endif - - current->finished = 0; - current = current -> next; - pos ++; - } - - EnterCriticalSection(&queue_lock); - - if (!work_queue) - { - work_queue = queue; - } - else - { - blas_queue_t *queue_item = work_queue; - - // find the end of the work queue - while (queue_item->next) - queue_item = queue_item->next; - - // add new work to the end - queue_item->next = queue; - } - - LeaveCriticalSection(&queue_lock); - - SetEvent(kickoff_event); - - return 0; -} - -// -// Join. 
Wait for all queued tasks to complete -// -int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { - - MT_TRACE("Synchronization Waiting.\n"); - - while (num) { - MT_TRACE("Waiting Queue ..\n"); - - while (!queue->finished) - YIELDING; - - queue = queue->next; - num--; - } - - MT_TRACE("Completely Done.\n\n"); - - // if work was added to the queue after this batch we can't sleep the worker threads - // by resetting the event - EnterCriticalSection(&queue_lock); - - if (work_queue == NULL) - ResetEvent(kickoff_event); - - LeaveCriticalSection(&queue_lock); - - return 0; -} - -// -// Execute Threads -// -int exec_blas(BLASLONG num, blas_queue_t *queue) { - -#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) - // Handle lazy re-init of the thread-pool after a POSIX fork - if (unlikely(blas_server_avail == 0)) blas_thread_init(); -#endif - -#ifndef ALL_THREADED - int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); -#endif - - if ((num <= 0) || (queue == NULL)) return 0; - - if ((num > 1) && queue -> next) - exec_blas_async(1, queue -> next); - - routine = queue -> routine; - - if (queue -> mode & BLAS_LEGACY) { - legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } else { - if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; - (pthreadcompat)(queue -> args); - } else - (routine)(queue -> args, queue -> range_m, queue -> range_n, - queue -> sa, queue -> sb, 0); - } - - if ((num > 1) && queue -> next) - exec_blas_async_wait(num - 1, queue -> next); - - return 0; -} - -// -// Shutdown procedure, but user don't have to call this routine. The -// kernel automatically kill threads. 
-// -int BLASFUNC(blas_thread_shutdown)(void) { - - int i; - - if (!blas_server_avail) return 0; - - LOCK_COMMAND(&server_lock); - - if (blas_server_avail) { - - for (i = 0; i < blas_num_threads - 1; i++) { - // Could also just use WaitForMultipleObjects - DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); - -#ifndef OS_WINDOWSSTORE - // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP - if (WAIT_OBJECT_0 != wait_thread_value) { - TerminateThread(blas_threads[i],0); - } -#endif - - CloseHandle(blas_threads[i]); - } - - blas_server_avail = 0; - } - - UNLOCK_COMMAND(&server_lock); - - return 0; -} - -// -// Legacy function to set numbef of threads -// -void goto_set_num_threads(int num_threads) -{ - long i; - -#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) - // Handle lazy re-init of the thread-pool after a POSIX fork - if (unlikely(blas_server_avail == 0)) blas_thread_init(); -#endif - - if (num_threads < 1) num_threads = blas_cpu_number; - - if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; - - if (blas_server_avail && num_threads < blas_num_threads) { - LOCK_COMMAND(&server_lock); - - thread_target = num_threads; - - SetEvent(kickoff_event); - - for (i = num_threads - 1; i < blas_num_threads - 1; i++) { - //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i); - - WaitForSingleObject(blas_threads[i], INFINITE); - - //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i); - - CloseHandle(blas_threads[i]); - } - - blas_num_threads = num_threads; - - ResetEvent(kickoff_event); - - UNLOCK_COMMAND(&server_lock); - } - - if (num_threads > blas_num_threads) { - - LOCK_COMMAND(&server_lock); - - thread_target = num_threads; - - //increased_threads = 1; - if (!blas_server_avail) { - // create the kickoff Event - kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - - InitializeCriticalSection(&queue_lock); - - blas_server_avail = 1; - } - - for (i = (blas_num_threads > 0) ? 
blas_num_threads - 1 : 0; i < num_threads - 1; i++) { - //MT_TRACE("set_num_threads: creating thread [%d]\n", i); - - blas_threads[i] = CreateThread(NULL, 0, - blas_thread_server, (void *)i, - 0, &blas_threads_id[i]); - } - - blas_num_threads = num_threads; - - UNLOCK_COMMAND(&server_lock); - } - - blas_cpu_number = num_threads; -} - -// -// Openblas function to set thread count -// -void openblas_set_num_threads(int num) -{ - goto_set_num_threads(num); -} +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#if !defined(unlikely) +#ifdef __GNUC__ +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define unlikely(x) (x) +#endif +#endif + +/* This is a thread implementation for Win32 lazy implementation */ + +/* Thread server common information */ +typedef struct{ + CRITICAL_SECTION lock; + HANDLE filled; + HANDLE killed; + + blas_queue_t *queue; /* Parameter Pointer */ + int shutdown; /* server shutdown flag */ + +} blas_pool_t; + +/* We need this global for checking if initialization is finished. 
*/ +int blas_server_avail = 0; +int blas_omp_threads_local = 1; +/* Local Variables */ +static BLASULONG server_lock = 0; + +static blas_pool_t pool; +static HANDLE blas_threads [MAX_CPU_NUMBER]; +static DWORD blas_threads_id[MAX_CPU_NUMBER]; + + + +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ + + if (!(mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ + /* REAL / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + /* REAL / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#ifdef BUILD_BFLOAT16 + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + /* REAL / BFLOAT16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((bfloat16 *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + /* REAL / BLAS_STOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, 
BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + /* REAL / BLAS_DTOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#endif + } else { + /* REAL / Other types in future */ + } + } else { +#ifdef EXPRECISION + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ + /* COMPLEX / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + ((xdouble *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + /* COMPLEX / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + ((double *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { + /* COMPLEX / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + ((float *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* COMPLEX / Other types in future */ 
+ } + } +} + +/* This is a main routine of threads. Each thread waits until job is */ +/* queued. */ + +static DWORD WINAPI blas_thread_server(void *arg){ + + /* Thread identifier */ +#ifdef SMP_DEBUG + BLASLONG cpu = (BLASLONG)arg; +#endif + + void *buffer, *sa, *sb; + blas_queue_t *queue; + DWORD action; + HANDLE handles[] = {pool.filled, pool.killed}; + + /* Each server needs each buffer */ + buffer = blas_memory_alloc(2); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); +#endif + + while (1){ + + /* Waiting for Queue */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); +#endif + + do { + action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); + } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); + + if (action == WAIT_OBJECT_0 + 1) break; + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); +#endif + + EnterCriticalSection(&pool.lock); + + queue = pool.queue; + if (queue) pool.queue = queue->next; + + LeaveCriticalSection(&pool.lock); + + if (queue) { + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + + if (pool.queue) SetEvent(pool.filled); + + sa = queue -> sa; + sb = queue -> sb; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Started. 
Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); +#endif + + // fprintf(stderr, "queue start[%ld]!!!\n", cpu); + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; +#endif + + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { + if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_DOUBLE + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else { + /* Other types in future */ + } + } else { +#ifdef EXPRECISION + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + } else { + /* Other types in future */ + } + } + queue->sb=sb; + } + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; +#endif + + if (!(queue -> mode & BLAS_LEGACY)) { + + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + } else { + legacy_exec(routine, queue -> mode, queue -> args, 
sb); + } + }else{ + continue; //if queue == NULL + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); +#endif + + EnterCriticalSection(&queue->lock); + + queue -> status = BLAS_STATUS_FINISHED; + + LeaveCriticalSection(&queue->lock); + + SetEvent(queue->finish); + } + + /* Shutdown procedure */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); +#endif + + blas_memory_free(buffer); + + return 0; + } + +/* Initializing routine */ +int blas_thread_init(void){ + BLASLONG i; + + if (blas_server_avail || (blas_cpu_number <= 1)) return 0; + + LOCK_COMMAND(&server_lock); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", + blas_cpu_number); +#endif + + if (!blas_server_avail){ + + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + + pool.shutdown = 0; + pool.queue = NULL; + + for(i = 0; i < blas_cpu_number - 1; i++){ + blas_threads[i] = CreateThread(NULL, 0, + blas_thread_server, (void *)i, + 0, &blas_threads_id[i]); + } + + blas_server_avail = 1; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +/* + User can call one of two routines. + + exec_blas_async ... immediately returns after jobs are queued. + + exec_blas ... returns after jobs are finished. 
+*/ + +int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ + +#if defined(SMP_SERVER) + // Handle lazy re-init of the thread-pool after a POSIX fork + // on Cygwin or as delayed init when a static library is used + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + + blas_queue_t *current; + + current = queue; + + while (current) { + InitializeCriticalSection(¤t -> lock); + current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); + current -> position = pos; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode)); + __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); +#endif + + current = current -> next; + pos ++; + } + + EnterCriticalSection(&pool.lock); + + if (pool.queue) { + current = pool.queue; + while (current -> next) current = current -> next; + current -> next = queue; + } else { + pool.queue = queue; + } + + LeaveCriticalSection(&pool.lock); + + SetEvent(pool.filled); + + return 0; +} + +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Synchronization Waiting.\n"); +#endif + + while (num){ +#ifdef SMP_DEBUG + fprintf(STDERR, "Waiting Queue ..\n"); +#endif + + WaitForSingleObject(queue->finish, INFINITE); + + CloseHandle(queue->finish); + DeleteCriticalSection(&queue -> lock); + + queue = queue -> next; + num --; + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Completely Done.\n\n"); +#endif + + return 0; +} + +/* Execute Threads */ +int exec_blas(BLASLONG num, blas_queue_t *queue){ + +#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + +#ifndef ALL_THREADED + int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); +#endif + + if ((num <= 0) || (queue == NULL)) return 0; + + if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + + routine = queue -> routine; + + if (queue -> 
mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else + (routine)(queue -> args, queue -> range_m, queue -> range_n, + queue -> sa, queue -> sb, 0); + + if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + + return 0; +} + +/* Shutdown procedure, but user don't have to call this routine. The */ +/* kernel automatically kill threads. */ + +int BLASFUNC(blas_thread_shutdown)(void){ + + int i; + + if (!blas_server_avail) return 0; + + LOCK_COMMAND(&server_lock); + + if (blas_server_avail){ + + SetEvent(pool.killed); + + for(i = 0; i < blas_num_threads - 1; i++){ + // Could also just use WaitForMultipleObjects + DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); + +#ifndef OS_WINDOWSSTORE + // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP + if (WAIT_OBJECT_0 != wait_thread_value) { + TerminateThread(blas_threads[i],0); + } +#endif + + CloseHandle(blas_threads[i]); + } + + CloseHandle(pool.filled); + CloseHandle(pool.killed); + + blas_server_avail = 0; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +void goto_set_num_threads(int num_threads) +{ + long i; + +#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + + if (num_threads < 1) num_threads = blas_cpu_number; + + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + + if (num_threads > blas_num_threads) { + + LOCK_COMMAND(&server_lock); + + //increased_threads = 1; + if (!blas_server_avail){ + + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + + pool.shutdown = 0; + pool.queue = NULL; + blas_server_avail = 
1; + } + + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ + + blas_threads[i] = CreateThread(NULL, 0, + blas_thread_server, (void *)i, + 0, &blas_threads_id[i]); + } + + blas_num_threads = num_threads; + + UNLOCK_COMMAND(&server_lock); + } + + blas_cpu_number = num_threads; +} + +void openblas_set_num_threads(int num) +{ + goto_set_num_threads(num); +} diff --git a/driver/others/detect_riscv64.c b/driver/others/detect_riscv64.c new file mode 100644 index 00000000..5a5cc039 --- /dev/null +++ b/driver/others/detect_riscv64.c @@ -0,0 +1,75 @@ +/***************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#include + +#ifdef __riscv_v_intrinsic +#include +#endif + +unsigned detect_riscv64_get_vlenb(void) { +#ifdef __riscv_v_intrinsic + return __riscv_vlenb(); +#else + return 0; +#endif +} + +/* + * Based on the approach taken here: + * https://code.videolan.org/videolan/dav1d/-/merge_requests/1629 + * + * Only to be called after we've determined we have some sort of + * RVV support. + */ + +uint64_t detect_riscv64_rvv100(void) +{ + uint64_t rvv10_supported; + + /* + * After the vsetvli statement vtype will either be a value > 0 if the + * vsetvli succeeded or less than 0 if it failed. If 0 < vtype + * we're good and the function will return 1, otherwise there's no + * RVV 1.0 and we return 0. 
+ */ + + asm volatile("vsetvli x0, x0, e8, m1, ta, ma\n\t" + "csrr %0, vtype\n\t" + "slt %0, x0, %0\n" + : "=r" (rvv10_supported) + : + :); + + return rvv10_supported; +} + diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index e3f90526..1f714200 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -927,6 +927,7 @@ static gotoblas_t *get_coretype(void){ case 0x7: switch (exmodel) { case 5: + case 6: if (support_avx2()) return &gotoblas_ZEN; else diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 6b21028d..dc88d816 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -120,6 +120,11 @@ extern gotoblas_t gotoblas_CORTEXA55; #else #define gotoblas_CORTEXA55 gotoblas_ARMV8 #endif +#ifdef DYN_A64FX +extern gotoblas_t gotoblas_A64FX; +#else +#define gotoblas_A64FX gotoblas_ARMV8 +#endif #else extern gotoblas_t gotoblas_CORTEXA53; #define gotoblas_CORTEXA55 gotoblas_CORTEXA53 @@ -136,10 +141,12 @@ extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; extern gotoblas_t gotoblas_ARMV8SVE; +extern gotoblas_t gotoblas_A64FX; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_ARMV8SVE gotoblas_ARMV8 +#define gotoblas_A64FX gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; #endif @@ -149,7 +156,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. 
OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 17 +#define NUM_CORETYPES 18 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -184,6 +191,7 @@ static char *corename[] = { "thunderx3t110", "cortexa55", "armv8sve", + "a64fx", "unknown" }; @@ -205,6 +213,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; + if (gotoblas == &gotoblas_A64FX) return corename[17]; return corename[NUM_CORETYPES]; } @@ -241,6 +250,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 14: return (&gotoblas_THUNDERX3T110); case 15: return (&gotoblas_CORTEXA55); case 16: return (&gotoblas_ARMV8SVE); + case 17: return (&gotoblas_A64FX); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -346,6 +356,15 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX3T110; } break; + case 0x46: // Fujitsu + switch (part) + { +#ifndef NO_SVE + case 0x001: // A64FX + return &gotoblas_A64FX; +#endif + } + break; case 0x48: // HiSilicon switch (part) { diff --git a/driver/others/dynamic_riscv64.c b/driver/others/dynamic_riscv64.c new file mode 100644 index 00000000..78e3bb67 --- /dev/null +++ b/driver/others/dynamic_riscv64.c @@ -0,0 +1,269 @@ +/***************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#include + +#include "common.h" + +/* + * OpenBLAS contains some kernels that are optimised for RVV 1.0. Before we + * can use these kernels we need to determine whether the device supports + * RVV 1.0 and what the device's VLEN is. Our strategy will be as follows. + * + * First we'll invoke the hwprobe syscall to detect RVV 1.0. In an ideal world, + * this is all we should need to do. If the syscall is not implemented we + * should be able to deduce that RVV 1.0 is not supported (as it was added to + * Linux after hwprobe) and if the syscall is implemented we can use it to + * determine whether RVV 1.0 is supported. 
However, there are some riscv64 + * boards out there that implement RVV 1.0 but ship with a Linux kernel that + * predates RVV vector support and hwprobe support. These kernels contain + * the backported RVV patches but not the hwprobe patches and so they + * advertise support for RVV via hwcap. To cater for these boards we need + * to fall back to hwcap if hwprobe is not supported. Unfortunately, some + * boards indicate support for RVV via hwcap even though they only support + * RVV 0.7.1, which is incompatible with RVV 1.0. So an additional check is + * required to test if the devices advertising support for RVV via hwcap really + * support RVV 1.0. This test works by executing a vsetvli instruction that + * sets the tail agnostic and mask agnostic bits in the vtype register. + * These bits are not supported prior to RVV 0.9 so will cause the VIL bit to + * be set on the VTYPE register in CPUs supporting 0.7.1. If this bit is set + * we can determine that RVV 1.0 is not supported. + * + * This approach is borrowed from + * VideoLan dav1d: + * (https://code.videolan.org/videolan/dav1d/-/merge_requests/1629). + * + * We assume that if a kernel reports the presence of RVV via hwcap that + * the device supports the vsetvli instruction. + * + * For now we're just going to invoke the hwprobe syscall directly, rather than + * invoking it through glibc. Support for hwprobe has been added to glibc but + * at the time of writing this support has not yet been included in a glibc + * release. Once it has, it will be better to invoke hwprobe via glibc as doing + * so should take advantage of the vdso entry and be more efficient. + */ + +/* + * This should work on Android as well but I have no way of testing. 
+ */ + +#if defined(OS_LINUX) +#include +#include +#include +#include + +#define DETECT_RISCV64_HWCAP_ISA_V (1 << ('V' - 'A')) + +struct riscv_hwprobe { + int64_t key; + uint64_t value; +}; + +/* The constants below are copied from + * /usr/include/riscv64-linux-gnu/asm/hwprobe.h. We duplicate the + * constants as the header file from which they are copied will only + * be present if we're building on a device with Linux 6.5 or greater. + */ + +#define RISCV_HWPROBE_KEY_IMA_EXT_0 4 +#define RISCV_HWPROBE_IMA_V (1 << 2) + +#ifndef NR_riscv_hwprobe +#ifndef NR_arch_specific_syscall +#define NR_arch_specific_syscall 244 +#endif +#define NR_riscv_hwprobe (NR_arch_specific_syscall + 14) +#endif +#endif // defined(OS_LINUX) + +unsigned detect_riscv64_get_vlenb(void); +uint64_t detect_riscv64_rvv100(void); + +extern gotoblas_t gotoblas_RISCV64_GENERIC; +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) +extern gotoblas_t gotoblas_RISCV64_ZVL256B; +#endif +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) +extern gotoblas_t gotoblas_RISCV64_ZVL128B; +#endif + +#define CPU_GENERIC 0 +#define CPU_RISCV64_ZVL256B 1 +#define CPU_RISCV64_ZVL128B 2 + +static char *cpuname[] = { + "riscv64_generic", + "riscv64_zvl256b", + "riscv64_zvl128b" +}; +#define NUM_CORETYPES (sizeof(cpuname)/sizeof(char*)) + +extern int openblas_verbose(void); +extern void openblas_warning(int verbose, const char* msg); + +char* gotoblas_corename(void) { +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) + if (gotoblas == &gotoblas_RISCV64_ZVL256B) + return cpuname[CPU_RISCV64_ZVL256B]; +#endif +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) + if (gotoblas == &gotoblas_RISCV64_ZVL128B) + return cpuname[CPU_RISCV64_ZVL128B]; +#endif + if (gotoblas == &gotoblas_RISCV64_GENERIC) + return cpuname[CPU_GENERIC]; + + return "unknown"; +} + +static gotoblas_t* get_coretype(void) { + unsigned vlenb = 0; + +#if !defined(OS_LINUX) + return NULL; +#else + + /* + * See the 
hwprobe documentation + * + * ( https://docs.kernel.org/arch/riscv/hwprobe.html ) + * for more details. + */ + + struct riscv_hwprobe pairs[] = { + { .key = RISCV_HWPROBE_KEY_IMA_EXT_0, }, + }; + int ret = syscall(NR_riscv_hwprobe, pairs, 1, 0, NULL, 0); + if (ret == 0) { + if (!(pairs[0].value & RISCV_HWPROBE_IMA_V)) + return NULL; + } else { + if (!(getauxval(AT_HWCAP) & DETECT_RISCV64_HWCAP_ISA_V)) + return NULL; + + if (!detect_riscv64_rvv100()) + return NULL; + } + + /* + * RVV 1.0 is supported. We now just need to determine the coretype + * based on the VLEN. + */ + + vlenb = detect_riscv64_get_vlenb(); + + if (vlenb < 16) + return NULL; +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) + if (vlenb >= 32) + return &gotoblas_RISCV64_ZVL256B; +#endif + +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) + return &gotoblas_RISCV64_ZVL128B; +#else + return NULL; +#endif + +#endif // !defined(OS_LINUX) +} + +static gotoblas_t* force_coretype(char* coretype) { + size_t i; + char message[128]; + + for (i = 0; i < NUM_CORETYPES && strcasecmp(coretype, cpuname[i]); i++); + + if (i == CPU_GENERIC) + return &gotoblas_RISCV64_GENERIC; + + if (i == CPU_RISCV64_ZVL256B) { +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) + return &gotoblas_RISCV64_ZVL256B; +#else + openblas_warning(1, + "riscv64_zvl256b support not compiled in\n"); + return NULL; +#endif + } + + if (i == CPU_RISCV64_ZVL128B) { +#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) + return &gotoblas_RISCV64_ZVL128B; +#else + openblas_warning(1, + "riscv64_zvl128b support not compiled in\n"); + return NULL; +#endif + } + + snprintf(message, sizeof(message), "Core not found: %s\n", coretype); + openblas_warning(1, message); + + return NULL; +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char* p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if (p) + gotoblas = force_coretype(p); + else + gotoblas = get_coretype(); + + if (!gotoblas) { + 
snprintf(coremsg, sizeof(coremsg), "Falling back to generic riscv64 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_RISCV64_GENERIC; + } + + if (gotoblas->init) { + snprintf(coremsg, sizeof(coremsg), "Core: %s\n", + gotoblas_corename()); + openblas_warning(2, coremsg); + gotoblas->init(); + return; + } + + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/memory.c b/driver/others/memory.c index 4ee8f9a2..6343a378 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -964,7 +964,9 @@ static void *alloc_shm(void *address){ return map_address; } -#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS +#endif + +#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) static void alloc_hugetlb_free(struct alloc_t *alloc_info){ @@ -1066,7 +1068,8 @@ static void *alloc_hugetlb(void *address){ } #endif -#endif + + #ifdef ALLOC_HUGETLBFILE @@ -1165,11 +1168,10 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER alloc_devicedirver, #endif -/* Hugetlb implicitly assumes ALLOC_SHM */ -#ifdef ALLOC_SHM +#ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB) alloc_shm, #endif -#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) +#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) alloc_hugetlb, #endif #ifdef ALLOC_MMAP @@ -1190,7 +1192,6 @@ void *blas_memory_alloc(int procpos){ struct alloc_t * alloc_info; struct alloc_t ** alloc_table; - #if defined(SMP) && !defined(USE_OPENMP) int mi; LOCK_COMMAND(&alloc_lock); @@ -1219,7 +1220,7 @@ UNLOCK_COMMAND(&alloc_lock); if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif -#if defined(ARCH_X86) || defined(ARCH_X86_64) || 
defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64) #ifndef DYNAMIC_ARCH blas_set_parameter(); #endif @@ -1282,7 +1283,7 @@ UNLOCK_COMMAND(&alloc_lock); } #endif -#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) +#if (defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif @@ -2494,7 +2495,7 @@ static void *alloc_devicedirver(void *address){ #endif -#ifdef ALLOC_SHM +#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB) static void alloc_shm_free(struct release_t *release){ @@ -2506,7 +2507,9 @@ static void alloc_shm_free(struct release_t *release){ static void *alloc_shm(void *address){ void *map_address; int shmid; - +#ifdef DEBUG + fprintf(stderr,"alloc_shm got called\n"); +#endif shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); map_address = (void *)shmat(shmid, address, 0); @@ -2533,6 +2536,7 @@ static void *alloc_shm(void *address){ return map_address; } +#endif #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS @@ -2562,6 +2566,10 @@ static void *alloc_hugetlb(void *address){ void *map_address = (void *)-1; +#ifdef DEBUG +fprintf(stderr,"alloc_hugetlb got called\n"); +#endif + #if defined(OS_LINUX) || defined(OS_AIX) int shmid; @@ -2583,7 +2591,7 @@ static void *alloc_hugetlb(void *address){ if (map_address != (void *)-1){ shmctl(shmid, IPC_RMID, 0); - } + }else printf("alloc_hugetlb failed\n"); } #endif @@ -2645,7 +2653,6 @@ static void *alloc_hugetlb(void *address){ } #endif -#endif #ifdef ALLOC_HUGETLBFILE @@ -2739,7 +2746,7 @@ struct newmemstruct }; static volatile struct newmemstruct *newmemory; -static int memory_initialized = 0; +static volatile int memory_initialized = 
0; static int memory_overflowed = 0; /* Memory allocation routine */ /* procpos ... indicates where it comes from */ @@ -2762,11 +2769,10 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER alloc_devicedirver, #endif -/* Hugetlb implicitly assumes ALLOC_SHM */ -#ifdef ALLOC_SHM +#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB) alloc_shm, #endif -#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) +#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) alloc_hugetlb, #endif #ifdef ALLOC_MMAP @@ -2785,14 +2791,12 @@ void *blas_memory_alloc(int procpos){ }; void *(**func)(void *address); -#if defined(USE_OPENMP) if (!memory_initialized) { +#if defined(SMP) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); + if (!memory_initialized) { #endif - LOCK_COMMAND(&alloc_lock); - - if (!memory_initialized) { - #if defined(WHEREAMI) && !defined(USE_OPENMP) for (position = 0; position < NUM_BUFFERS; position ++){ memory[position].addr = (void *)0; @@ -2814,19 +2818,19 @@ void *blas_memory_alloc(int procpos){ if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64) #ifndef DYNAMIC_ARCH blas_set_parameter(); #endif #endif memory_initialized = 1; - + WMB; +#if defined(SMP) && !defined(USE_OPENMP) } UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } #endif +} #ifdef DEBUG printf("Alloc Start ...\n"); @@ -2945,8 +2949,22 @@ void *blas_memory_alloc(int procpos){ } #endif -#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) +#if (defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined 
OS_WINDOWS) if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#ifdef DEBUG + if (hugetlb_allocated) printf("allocating via shared memory with large page support (hugetlb)\n"); +#endif +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) +#ifdef DEBUG + printf("allocating via shared memory\n"); +#endif + if ((*func == alloc_shm) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n"); +#endif + } #endif func ++; @@ -3061,10 +3079,23 @@ void *blas_memory_alloc(int procpos){ } #endif -#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) +#if (defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) +#ifdef DEBUG + fprintf(stderr,"OpenBLAS: allocating via shared memory with large page support (hugetlb)\n"); +#endif if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) +#ifdef DEBUG + fprintf(stderr,"allocating via shared memory\n"); +#endif + if ((*func == alloc_shm) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n"); +#endif + } +#endif func ++; } diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 867d0e36..ff52cfba 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -13,9 +13,9 @@ modification, are permitted provided that the following conditions are notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -63,6 +63,9 @@ static char* openblas_config_str="" #ifdef USE_TLS "USE_TLS " #endif +#ifdef USE_LOCKING + "USE_LOCKING " +#endif #ifndef DYNAMIC_ARCH CHAR_CORENAME #endif @@ -83,7 +86,7 @@ char tmpstr[20]; #endif if (openblas_get_parallel() == 0) sprintf(tmpstr, " SINGLE_THREADED"); - else + else snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); strcat(tmp_config_str, tmpstr); return tmp_config_str; @@ -91,7 +94,7 @@ char tmpstr[20]; char* openblas_get_corename(void) { -#ifndef DYNAMIC_ARCH +#ifndef DYNAMIC_ARCH return CHAR_CORENAME; #else return gotoblas_corename(); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index de6bf0de..a208a1a9 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -739,6 +739,100 @@ void blas_set_parameter(void){ } #endif +#if defined(ARCH_LOONGARCH64) +int get_L3_size() { + int ret = 0, id = 0x14; + __asm__ volatile ( + "cpucfg %[ret], %[id]" + : [ret]"=r"(ret) + : [id]"r"(id) + : "memory" + ); + return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB +} + +void blas_set_parameter(void){ +#if defined(LOONGSON3R5) + int L3_size = get_L3_size(); +#ifdef SMP + if(blas_num_threads == 1){ +#endif + //single thread + if (L3_size == 32){ // 3C5000 and 3D5000 + sgemm_p = 256; + sgemm_q = 384; + sgemm_r = 8192; + + dgemm_p = 112; + dgemm_q = 289; + dgemm_r = 4096; + + cgemm_p = 128; + cgemm_q = 256; + cgemm_r = 4096; + + zgemm_p = 128; + zgemm_q = 128; + zgemm_r = 2048; + } else { // 3A5000 and 3C5000L 
+ sgemm_p = 256; + sgemm_q = 384; + sgemm_r = 4096; + + dgemm_p = 112; + dgemm_q = 300; + dgemm_r = 3024; + + cgemm_p = 128; + cgemm_q = 256; + cgemm_r = 2048; + + zgemm_p = 128; + zgemm_q = 128; + zgemm_r = 1024; + } +#ifdef SMP + }else{ + //multi thread + if (L3_size == 32){ // 3C5000 and 3D5000 + sgemm_p = 256; + sgemm_q = 384; + sgemm_r = 1024; + + dgemm_p = 112; + dgemm_q = 289; + dgemm_r = 342; + + cgemm_p = 128; + cgemm_q = 256; + cgemm_r = 512; + + zgemm_p = 128; + zgemm_q = 128; + zgemm_r = 512; + } else { // 3A5000 and 3C5000L + sgemm_p = 256; + sgemm_q = 384; + sgemm_r = 2048; + + dgemm_p = 112; + dgemm_q = 300; + dgemm_r = 738; + + cgemm_p = 128; + cgemm_q = 256; + cgemm_r = 1024; + + zgemm_p = 128; + zgemm_q = 128; + zgemm_r = 1024; + } + } +#endif +#endif +} +#endif + #if defined(ARCH_ARM64) void blas_set_parameter(void) diff --git a/exports/Makefile b/exports/Makefile index 4d929c8d..668a4866 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -137,7 +137,7 @@ libgoto_hpl.def : $(GENSYM) ifeq ($(OSNAME), Darwin) ifeq ($(FIXED_LIBNAME),1) -INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib +INTERNALNAME = $(LIBPREFIX).dylib else INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib endif @@ -178,7 +178,7 @@ FEXTRALIB += -lm EXTRALIB += -lm else ifeq ($(FIXED_LIBNAME),1) -INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so +INTERNALNAME = $(LIBPREFIX).so else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif @@ -315,11 +315,6 @@ test : linktest.c linktest.c : $(GENSYM) ../Makefile.system ../getarch.c ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c -ifeq ($(F_COMPILER), IBM) - mv linktest.c linktest.c.FIRST - egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c - rm linktest.c.FIRST -endif clean :: @rm -f *.def 
*.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 60298508..f3f7b9fc 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -60,7 +60,7 @@ cblasobjsc=" cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy - cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin + cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin cblas_cgemm_batch " cblasobjsd=" cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot @@ -70,7 +70,7 @@ cblasobjsd=" cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy - cblas_damax cblas_damin + cblas_damax cblas_damin cblas_dgemm_batch " cblasobjss=" @@ -82,7 +82,7 @@ cblasobjss=" cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy - cblas_samax cblas_samin + cblas_samax cblas_samin cblas_sgemm_batch " cblasobjsz=" @@ -94,12 +94,12 @@ cblasobjsz=" cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy - cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin + cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin cblas_zgemm_batch " cblasobjs="cblas_xerbla" -bfcblasobjs="cblas_sbgemm cblas_sbgemv cblas_sbdot cblas_sbstobf16 cblas_sbdtobf16 cblas_sbf16tos cblas_dbf16tod" +bfcblasobjs="cblas_sbgemm cblas_sbgemv cblas_sbdot cblas_sbstobf16 cblas_sbdtobf16 cblas_sbf16tos cblas_dbf16tod cblas_sbgemm_batch" 
exblasobjs=" qamax qamin qasum qaxpy qcabs1 qcopy qdot qgbmv qgemm diff --git a/f_check b/f_check index 81f598ff..93c5962d 100755 --- a/f_check +++ b/f_check @@ -86,7 +86,7 @@ else vendor=CRAY openmp='-fopenmp' ;; - *Arm\ F90*) + *Arm\ F90*|*F90\ Flang*) vendor=FLANG openmp='-fopenmp' ;; diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 55374674..449072ba 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -97,6 +97,9 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") + if(CBLAS_FLAG EQUAL 1) + GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" ${CBLAS_FLAG} "" "" false) +endif () endif () if (BUILD_DOUBLE) GenerateNamedObjects("dsdot.c" "" "dsdot" ${CBLAS_FLAG} "" "" true "SINGLE") @@ -125,13 +128,16 @@ if (BUILD_BFLOAT16) GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") + if(CBLAS_FLAG EQUAL 1) + GenerateNamedObjects("gemm_batch.c" "" "sbgemm_batch" ${CBLAS_FLAG} "" "" true "BFLOAT16") +endif () endif () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") - GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zaxpy.c" "CONJ" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) @@ -154,6 +160,9 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true 
"COMPLEX") GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") + if(CBLAS_FLAG EQUAL 1) + GenerateNamedObjects("gemm_batch.c" "" "cgemm_batch" ${CBLAS_FLAG} "" "" true "COMPLEX") + endif () endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") @@ -163,6 +172,9 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + if(CBLAS_FLAG EQUAL 1) + GenerateNamedObjects("gemm_batch.c" "" "zgemm_batch" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + endif () endif () endforeach () @@ -212,6 +224,7 @@ if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "SINGLE") GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "SINGLE") GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "SINGLE") + GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" 1 "" "" false "SINGLE") GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "SINGLE") GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "SINGLE") GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "SINGLE") @@ -225,6 +238,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "DOUBLE") GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "DOUBLE") GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "DOUBLE") + GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" 1 "" "" false "DOUBLE") GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "DOUBLE") GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "DOUBLE") GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "DOUBLE") diff --git a/interface/Makefile b/interface/Makefile index 57b2b697..b6684c8f 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -282,12 +282,12 @@ 
CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -308,7 +308,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -333,7 +333,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -364,7 +364,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -2419,6 +2419,21 @@ cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : 
zgeadd.c cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) +cblas_sbgemm_batch.$(SUFFIX) cblas_sbgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_sgemm_batch.$(SUFFIX) cblas_sgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_dgemm_batch.$(SUFFIX) cblas_dgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_cgemm_batch.$(SUFFIX) cblas_cgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zgemm_batch.$(SUFFIX) cblas_zgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + # The list of symbols to be removed can be seen in the diff between LAPACK's # original SRC/Makefile and the version of that same file that is included in # OpenBLAS (unfiltered) tarball diff --git a/interface/gbmv.c b/interface/gbmv.c index 1d58ba80..7a658136 100644 --- a/interface/gbmv.c +++ b/interface/gbmv.c @@ -227,7 +227,10 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (m * n < 250000 || kl+ku < 15 ) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/interface/gemm.c b/interface/gemm.c index 0902bc02..64b8b620 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -1,4 +1,5 @@ /*********************************************************************/ +/* Copyright 2024 The OpenBLAS Project */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ @@ -47,12 +48,16 @@ #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " +#define GEMV BLASFUNC(qgemv) #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#define GEMV BLASFUNC(dgemv) #elif defined(BFLOAT16) #define ERROR_NAME "SBGEMM " +#define GEMV BLASFUNC(sbgemv) #else #define ERROR_NAME "SGEMM " +#define GEMV BLASFUNC(sgemv) #endif #else #define SMP_THRESHOLD_MIN 8192.0 @@ -493,6 +498,52 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS args.m, args.n, args.k, args.lda, args.ldb, args.ldc); #endif +#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16) + // Check if we can convert GEMM -> GEMV + if (args.k != 0) { + if (args.n == 1) { + blasint inc_x = 1; + blasint inc_y = 1; + // These were passed in as blasint, but the struct translates them to blaslong + blasint m = args.m; + blasint n = args.k; + blasint lda = args.lda; + // Create new transpose parameters + char NT = 'N'; + if (transa & 1) { + NT = 'T'; + m = args.k; + n = args.m; + } + if (transb & 1) { + inc_x = args.ldb; + } + GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y); + return; + } + if (args.m == 1) { + blasint inc_x = args.lda; + blasint inc_y = args.ldc; + // These were passed in as blasint, but the struct translates them to blaslong + blasint m = args.k; + blasint n = args.n; + blasint ldb = args.ldb; + // Create new transpose parameters + char NT = 'T'; + if (transa & 1) { + inc_x = 1; + } + if (transb & 1) { + NT = 'N'; + m = args.n; + n = args.k; + } + GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y); + return; + } + } +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -521,7 +572,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS buffer = (XFLOAT *)blas_memory_alloc(0); +//For target LOONGSON3R5, applying an offset to the buffer is essential +//for minimizing cache 
conflicts and optimizing performance. +#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) + sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); +#else sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); +#endif sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP diff --git a/interface/gemm_batch.c b/interface/gemm_batch.c new file mode 100644 index 00000000..56ccc12c --- /dev/null +++ b/interface/gemm_batch.c @@ -0,0 +1,372 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include +#include +#include "common.h" + +void openblas_warning(int verbose, const char * msg); + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QGEMM_BATCH " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMM_BATCH " +#define GEMM_BATCH_THREAD dgemm_batch_thread +#else +#define ERROR_NAME "SGEMM_BATCH " +#define GEMM_BATCH_THREAD sgemm_batch_thread +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XGEMM_BATCH " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMM_BATCH " +#define GEMM_BATCH_THREAD zgemm_batch_thread +#else +#define ERROR_NAME "CGEMM_BATCH " +#define GEMM_BATCH_THREAD cgemm_batch_thread +#endif +#endif +static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { + GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, + GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, + GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, + GEMM_NC, GEMM_TC, GEMM_RC, GEMM_CC, +}; + +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) +#define USE_SMALL_MATRIX_OPT 1 +#else +#define USE_SMALL_MATRIX_OPT 0 +#endif + +#if USE_SMALL_MATRIX_OPT +#ifndef DYNAMIC_ARCH +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) +#else +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) +#endif + + +#ifndef COMPLEX +static size_t gemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, + 
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, +}; + + +static size_t gemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, +}; + +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) +#else + +static size_t zgemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, + GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, + GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, +}; + +static size_t zgemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, + GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, + GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, +}; + +#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) +#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) +#endif +#endif + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CBLAS_TRANSPOSE * transb_array, 
+ blasint * m_array, blasint * n_array, blasint * k_array, +#ifndef COMPLEX + FLOAT * alpha_array, + IFLOAT ** a_array, blasint * lda_array, + IFLOAT ** b_array, blasint * ldb_array, + FLOAT * beta_array, + FLOAT ** c_array, blasint * ldc_array, blasint group_count, blasint * group_size) { +#else + void * valpha_array, + void ** va_array, blasint * lda_array, + void ** vb_array, blasint * ldb_array, + void * vbeta_array, + void ** vc_array, blasint * ldc_array, blasint group_count, blasint * group_size) { + + FLOAT * alpha_array=(FLOAT *)valpha_array; + FLOAT * beta_array=(FLOAT *)vbeta_array; + FLOAT ** a_array=(FLOAT**)va_array; + FLOAT ** b_array=(FLOAT**)vb_array; + FLOAT ** c_array=(FLOAT**)vc_array; + +#endif + blas_arg_t * args_array=NULL; + + int mode=0, group_mode=0; + blasint total_num=0; + + blasint i=0, j=0, matrix_idx=0, count=0; + + int group_transa, group_transb; + BLASLONG group_nrowa, group_nrowb; + blasint info; + + void * group_alpha, * group_beta; + BLASLONG group_m, group_n, group_k; + BLASLONG group_lda, group_ldb, group_ldc; + void * group_routine=NULL; +#ifdef SMALL_MATRIX_OPT + void * group_small_matrix_opt_routine=NULL; +#endif + +#if defined (SMP) || defined(SMALL_MATRIX_OPT) + double MNK; +#endif + + PRINT_DEBUG_CNAME; + + for(i=0; i= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + free(args_array); + return; + } + + if (group_m == 0 || group_n == 0) continue; + + group_mode=mode; + +#if defined(SMP) || defined(SMALL_MATRIX_OPT) + MNK = (double) group_m * (double) group_n * (double) group_k; +#endif + +#ifdef SMALL_MATRIX_OPT + if (MNK <= 100.0*100.0*100.0){ + group_routine=NULL; +#if !defined(COMPLEX) + if(*(FLOAT *)(group_beta) == 0.0){ + group_mode=mode | BLAS_SMALL_B0_OPT; + group_small_matrix_opt_routine=(void *)(gemm_small_kernel_b0[(group_transb<<2)|group_transa]); + }else{ + group_mode=mode | BLAS_SMALL_OPT; + group_small_matrix_opt_routine=(void *)(gemm_small_kernel[(group_transb<<2)|group_transa]); + } +#else 
+ if(((FLOAT *)(group_beta))[0] == 0.0 && ((FLOAT *)(group_beta))[1] == 0.0){ + group_mode=mode | BLAS_SMALL_B0_OPT; + group_small_matrix_opt_routine=(void *)(zgemm_small_kernel_b0[(group_transb<<2)|group_transa]); + }else{ + group_mode=mode | BLAS_SMALL_OPT; + group_small_matrix_opt_routine=(void *)(zgemm_small_kernel[(group_transb<<2)|group_transa]); + } + +#endif + + }else{ +#endif + group_routine=(void*)(gemm[(group_transb<<2)|group_transa]); +#ifdef SMALL_MATRIX_OPT + } +#endif + + + for(j=0; j0){ + GEMM_BATCH_THREAD(args_array,count); + } + + free(args_array); +} diff --git a/interface/scal.c b/interface/scal.c index 0a7fee64..c6638a62 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -85,7 +85,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ if (nthreads == 1) { #endif - SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0); + SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1); #ifdef SMP } else { @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #else &alpha, #endif - x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); + x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/zgbmv.c b/interface/zgbmv.c index 5e275a8e..5128b22e 100644 --- a/interface/zgbmv.c +++ b/interface/zgbmv.c @@ -251,7 +251,10 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (m * n < 125000 || ku + kl < 15) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 863f376e..ed1c74ec 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -17,6 +17,7 @@ ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif + ifeq ($(ARCH), arm) USE_TRMM = 1 endif diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c index 4ef49e29..6a2c3763 100644 --- a/kernel/arm/scal.c +++ b/kernel/arm/scal.c @@ -43,18 +43,36 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG 
dummy1, FLOAT da, FLOAT *x, BLAS if ( (n <= 0) || (inc_x <= 0)) return(0); + if (dummy2 == 0) { + while(j < n) + { - while(j < n) - { + if ( da == 0.0 ) + x[i]=0.0; + else + x[i] = da * x[i] ; + + i += inc_x ; + j++; + } + } else { + + while(j < n) + { if ( da == 0.0 ) + if (!isnan(x[i]) && !isinf(x[i])) { x[i]=0.0; + } else { + x[i]=NAN; + } else x[i] = da * x[i] ; i += inc_x ; j++; + } } return 0; diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index b2d537d0..c4855f73 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -61,7 +61,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F { temp = - da_i * x[ip+1] ; if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; - x[ip+1] = da_i * x[ip] ; + if (!isinf(x[ip+1])) + x[ip+1] = da_i * x[ip] ; + else x[ip+1] = NAN; } } else diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index bc599909..4abc8404 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -1 +1,6 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE + +SGEMVNKERNEL = gemv_n_sve.c +DGEMVNKERNEL = gemv_n_sve.c +SGEMVTKERNEL = gemv_t_sve.c +DGEMVTKERNEL = gemv_t_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index eeb4844b..bfadf5cb 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -131,6 +131,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_sve.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_sve.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_sve.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_sve.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_sve.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_sve.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_sve.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_sve.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_sve.c + STRMMUNCOPY_M = trmm_uncopy_sve_v1.c 
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c STRMMUTCOPY_M = trmm_utcopy_sve_v1.c @@ -152,6 +162,16 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_sve.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_sve.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_sve.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_sve.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_sve.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_sve.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_sve.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_sve.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_sve.c + DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index 9fe981c5..5b317447 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -93,8 +93,8 @@ IZAMAXKERNEL = izamax_thunderx2t99.c SNRM2KERNEL = nrm2.S DNRM2KERNEL = nrm2.S -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.c SDOTKERNEL = dot.c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index bc599909..53d157a0 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1 +1,4 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE + +SGEMVTKERNEL = gemv_t_sve.c +DGEMVTKERNEL = gemv_t_sve.c diff --git a/kernel/arm64/KERNEL.NEOVERSEV2 b/kernel/arm64/KERNEL.NEOVERSEV2 new file mode 100644 index 00000000..bc599909 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEV2 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/dgemm_small_kernel_nn_sve.c b/kernel/arm64/dgemm_small_kernel_nn_sve.c new file mode 100644 index 00000000..fa39103d --- /dev/null +++ b/kernel/arm64/dgemm_small_kernel_nn_sve.c @@ -0,0 +1,742 @@ 
+/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; +#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; +#define INCR_C_POINTER(m, incr) // c_offset ## m += incr; +#define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; +#define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) + +// #undef C_ELEMENT +// #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] +#define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR2(m, n) \ + float64x2_t result##m##n = vdupq_n_f64(0.0); +#define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; +#define BROADCAST_LOAD_A2(m, offset_k) \ + float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define 
VECTOR_LOAD_B_K2(n, offset_k) \ + float64x2_t b##k##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); +#define TRANSPOSE_B2_K2(n0, n1, offset_k0, offset_k1) \ + float64x2_t b##n0##_k##offset_k0 = \ + vzip1q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ + float64x2_t b##n0##_k##offset_k1 = \ + vzip2q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); + +#define SCALE_B2_K2(n0, offset_k0, offset_k1) \ + svfloat64_t b##s##n0##_k##offset_k0 = svdup_neonq_f64(b##n0##_k##offset_k0); \ + svfloat64_t b##s##n0##_k##offset_k1 = svdup_neonq_f64(b##n0##_k##offset_k1); +#define GATHER_LOAD_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); +#define VECTOR_UNPACK_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); +#define VECTOR_PACK_B2(n, offset_k) \ + vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); +#define PACK_B0(n, offset_k) \ + PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); +#define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ + result##m##n = \ + vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define SCATTER_STORE2(m, n) \ + result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ + C_ELEMENT(m, n + 0) = vgetq_lane_f64(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = vgetq_lane_f64(result##m##n, 1); +#else +#define SCATTER_STORE2(m, n) \ + result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ + C_ELEMENT(m, n + 0) = \ + C_ELEMENT(m, n + 0) * beta + vgetq_lane_f64(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = \ + C_ELEMENT(m, n + 1) * beta + vgetq_lane_f64(result##m##n, 1); +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) svfloat64_t result##m##n = svdup_f64(0.0); +#define BROADCAST_LOAD_A(m, 
offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define QUADWORD_LOAD_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); +#define PACK_B(n, offset_k) \ + svst1(pg_first, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); +#define VECTOR_PACK_B(n, offset_k) \ + svst1(pg_true, &PACK_ELEMENT_K(n* v_size, offset_k), b##s##n##_k##offset_k); +#define QUADWORD_PACK_B(n, offset_k) \ + svst1(pg_quad, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); +#define UNPACK_VECTOR_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = \ + svld1(pg_true, &PACK_ELEMENT_K(n * v_size, offset_k)); +#define UNPACK_BROADCAST_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = svdup_f64(PACK_ELEMENT_K(n, offset_k)); +#define UNPACK_QUADWORD_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &PACK_ELEMENT_K(n, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, 
&C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntd(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b64(); + const svbool_t pg_quad = svwhilelt_b64(0, 2); + const svbool_t pg_first = svwhilelt_b64(0, 1); + const svfloat64_t alpha_vec = svdup_f64(alpha); +#ifndef B0 + const svfloat64_t beta_vec = svdup_f64(beta); +#endif + const BLASLONG n4 = N & -4; + const BLASLONG n2 = N & -2; + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + const BLASLONG k2 = K & -2; + + const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + FLOAT* packed_b = + (pack_b) ? 
packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; + + FLOAT* b_offset = B; + FLOAT* a_offset = A; + FLOAT* c_offset = C; + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, 1); + CREATE_C_POINTER(2, 2); + CREATE_C_POINTER(3, 3); + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_b != NULL)) { + if (i == 0) { + for (; k < k2; k += 2) { + + VECTOR_LOAD_B_K2(0, 0); + VECTOR_LOAD_B_K2(1, 0); + TRANSPOSE_B2_K2(0, 1, 0, 1); + SCALE_B2_K2(0, 0, 1); + VECTOR_PACK_B2(0, 0); + VECTOR_PACK_B2(0, 1); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + VECTOR_LOAD_B_K2(2, 0); + VECTOR_LOAD_B_K2(3, 0); + TRANSPOSE_B2_K2(2, 3, 0, 1); + SCALE_B2_K2(2, 0, 1); + VECTOR_PACK_B2(2, 0); + VECTOR_PACK_B2(2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); + 
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 1); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + PACK_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + PACK_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + PACK_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + PACK_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } else { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UNPACK_QUADWORD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); + } + } + } else { + for (; k < k2; k += 2) { + + VECTOR_LOAD_B_K2(0, 0); + VECTOR_LOAD_B_K2(1, 0); + TRANSPOSE_B2_K2(0, 1, 0, 1); + SCALE_B2_K2(0, 0, 1); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + VECTOR_LOAD_B_K2(2, 0); + VECTOR_LOAD_B_K2(3, 0); + TRANSPOSE_B2_K2(2, 3, 0, 1); + SCALE_B2_K2(2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); + VECTOR_LOAD_A(pg_true, 1, 0); + 
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 1); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + INCR_C_POINTER(2, v_size2); + INCR_C_POINTER(3, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UNPACK_QUADWORD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + } + } else { + for (; k < k2; k += 2) { + + VECTOR_LOAD_B_K2(0, 0); + VECTOR_LOAD_B_K2(1, 0); + 
TRANSPOSE_B2_K2(0, 1, 0, 1); + SCALE_B2_K2(0, 0, 1); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + VECTOR_LOAD_B_K2(2, 0); + VECTOR_LOAD_B_K2(3, 0); + TRANSPOSE_B2_K2(2, 3, 0, 1); + SCALE_B2_K2(2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + INCR_C_POINTER(0, v_size); + INCR_C_POINTER(1, v_size); + INCR_C_POINTER(2, v_size); + INCR_C_POINTER(3, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UNPACK_QUADWORD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + } + } else { + for (; k < k2; k += 2) { + + VECTOR_LOAD_B_K2(0, 0); + VECTOR_LOAD_B_K2(1, 0); + TRANSPOSE_B2_K2(0, 1, 0, 1); + SCALE_B2_K2(0, 0, 1); + 
VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_tail, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + VECTOR_LOAD_B_K2(2, 0); + VECTOR_LOAD_B_K2(3, 0); + TRANSPOSE_B2_K2(2, 3, 0, 1); + SCALE_B2_K2(2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); + } + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 0); + INCR_C_POINTER(1, 0); + INCR_C_POINTER(2, 0); + INCR_C_POINTER(3, 0); + } + + UPDATE_B_POINTER(4); + RESET_A_POINTER(); + UPDATE_C_POINTER(4); + } + for (; j < n2; j += 2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, 1); + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + + for (; k < k2; k += 2) { + + VECTOR_LOAD_B_K2(0, 0); + VECTOR_LOAD_B_K2(1, 0); + TRANSPOSE_B2_K2(0, 1, 0, 1); + SCALE_B2_K2(0, 0, 1); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 
1, 1); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < k2; k += 2) { + + VECTOR_LOAD_B_K2(0, 0); + VECTOR_LOAD_B_K2(1, 0); + TRANSPOSE_B2_K2(0, 1, 0, 1); + SCALE_B2_K2(0, 0, 1); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + INCR_C_POINTER(0, v_size); + INCR_C_POINTER(1, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < k2; k += 2) { + + VECTOR_LOAD_B_K2(0, 0); + VECTOR_LOAD_B_K2(1, 0); + TRANSPOSE_B2_K2(0, 1, 
0, 1); + SCALE_B2_K2(0, 0, 1); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_tail, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + INCR_C_POINTER(0, 0); + INCR_C_POINTER(1, 0); + } + + UPDATE_B_POINTER(2); + RESET_A_POINTER(); + UPDATE_C_POINTER(2); + } + for (; j < N; j++) { + + CREATE_C_POINTER(0, 0); + CREATE_B_POINTER(0, 0); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + INCR_C_POINTER(0, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 
0); + } + + UPDATE_B_POINTER(1); + RESET_A_POINTER(); + UPDATE_C_POINTER(1); + } + + if (pack_b) + free(packed_b); + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/dgemm_small_kernel_nt_sve.c b/kernel/arm64/dgemm_small_kernel_nt_sve.c new file mode 100644 index 00000000..0b306e75 --- /dev/null +++ b/kernel/arm64/dgemm_small_kernel_nt_sve.c @@ -0,0 +1,474 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; +#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; +#define INCR_C_POINTER(m, incr) // c_offset ## m += incr; +#define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; +#define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) + +// #undef C_ELEMENT +// #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] +#define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR2(m, n) \ + float64x2_t result##m##n = vdupq_n_f64(0.0); +#define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; +#define BROADCAST_LOAD_A2(m, offset_k) \ + float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define 
VECTOR_LOAD_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); +#define GATHER_LOAD_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); +#define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ + result##m##n = \ + vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define SCATTER_STORE2(m, n) \ + result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ + C_ELEMENT(m, n + 0) = vgetq_lane_f64(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = vgetq_lane_f64(result##m##n, 1); +#else +#define SCATTER_STORE2(m, n) \ + result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ + C_ELEMENT(m, n + 0) = \ + C_ELEMENT(m, n + 0) * beta + vgetq_lane_f64(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = \ + C_ELEMENT(m, n + 1) * beta + vgetq_lane_f64(result##m##n, 1); +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) svfloat64_t result##m##n = svdup_f64(0.0); +#define BROADCAST_LOAD_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define QUADWORD_LOAD_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define 
VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntd(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b64(); + const svbool_t pg_quad = svwhilelt_b64(0, 2); + const svfloat64_t alpha_vec = svdup_f64(alpha); +#ifndef B0 + const svfloat64_t beta_vec = svdup_f64(beta); +#endif + const BLASLONG n4 = N & -4; + const BLASLONG n2 = N & -2; + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + + FLOAT* b_offset = B; + FLOAT* a_offset = A; + FLOAT* c_offset = C; + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, 1); + CREATE_C_POINTER(2, 2); + CREATE_C_POINTER(3, 3); + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 
2); + CREATE_B_POINTER(3, 3); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + INCR_C_POINTER(2, v_size2); + INCR_C_POINTER(3, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 
0, 3); + INCR_C_POINTER(0, v_size); + INCR_C_POINTER(1, v_size); + INCR_C_POINTER(2, v_size); + INCR_C_POINTER(3, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 0); + INCR_C_POINTER(1, 0); + INCR_C_POINTER(2, 0); + INCR_C_POINTER(3, 0); + } + + UPDATE_B_POINTER(4); + RESET_A_POINTER(); + UPDATE_C_POINTER(4); + } + for (; j < n2; j += 2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, 1); + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + } + for (; i < v_m1; i += v_size) { + + 
CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + INCR_C_POINTER(0, v_size); + INCR_C_POINTER(1, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + INCR_C_POINTER(0, 0); + INCR_C_POINTER(1, 0); + } + + UPDATE_B_POINTER(2); + RESET_A_POINTER(); + UPDATE_C_POINTER(2); + } + for (; j < N; j++) { + + CREATE_C_POINTER(0, 0); + CREATE_B_POINTER(0, 0); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + INCR_C_POINTER(0, v_size); + } + for (; i < M; 
i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 0); + } + + UPDATE_B_POINTER(1); + RESET_A_POINTER(); + UPDATE_C_POINTER(1); + } + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/dgemm_small_kernel_tn_sve.c b/kernel/arm64/dgemm_small_kernel_tn_sve.c new file mode 100644 index 00000000..daca8e1b --- /dev/null +++ b/kernel/arm64/dgemm_small_kernel_tn_sve.c @@ -0,0 +1,571 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; +#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + scale; +#define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; +#define UPDATE_C_POINTER(scale) c_offset += scale; +#define C_ELEMENT(m, n) \ + *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] + +// #undef C_ELEMENT +// 
#define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] +#define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR2(m, n) \ + float64x2_t result##m##n = vdupq_n_f64(0.0); +#define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; +#define BROADCAST_LOAD_A2(m, offset_k) \ + float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define GATHER_LOAD_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); +#define VECTOR_UNPACK_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); +#define PACK_B0(n, offset_k) \ + PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); +#define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ + result##m##n = \ + vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define SCATTER_STORE2(m, n) \ + result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ + C_ELEMENT(m, n + 0) = vgetq_lane_f64(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = vgetq_lane_f64(result##m##n, 1); +#else +#define SCATTER_STORE2(m, n) \ + result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ + C_ELEMENT(m, n + 0) = \ + C_ELEMENT(m, n + 0) * beta + vgetq_lane_f64(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = \ + C_ELEMENT(m, n + 1) * beta + vgetq_lane_f64(result##m##n, 1); +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) svfloat64_t result##m##n = svdup_f64(0.0); +#define BROADCAST_LOAD_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat64_t 
b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define GATHER_LOAD_A(pg, m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = \ + svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); +#define PACK_A(m, offset_k) \ + svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define VECTOR_PACK_A(m, offset_k) \ + svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); +#define QUADWORD_PACK_A(m, offset_k) \ + svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define UNPACK_VECTOR_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = \ + svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); +#define UNPACK_BROADCAST_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svdup_f64(PACK_ELEMENT_K(m, offset_k)); +#define UNPACK_QUADWORD_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = \ + svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + 
result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntd(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b64(); + const svbool_t pg_quad = svwhilelt_b64(0, 2); + const svbool_t pg_first = svwhilelt_b64(0, 1); + const svfloat64_t alpha_vec = svdup_f64(alpha); +#ifndef B0 + const svfloat64_t beta_vec = svdup_f64(beta); +#endif + const svuint64_t lda_vec = svindex_u64(0LL, lda); + + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + const BLASLONG n4 = N & -4; + const BLASLONG n2 = N & -2; + + const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + FLOAT* packed_a = + (pack_a) ? 
packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; + + FLOAT* a_offset = A; + FLOAT* b_offset = B; + FLOAT* c_offset = C; + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, v_size); + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_a != NULL)) { + if (j == 0) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + VECTOR_PACK_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + VECTOR_PACK_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + 
GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, 4); + INCR_C_POINTER(1, 4); + } + for (; j < n2; j += 2) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + UPDATE_B_POINTER(2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + INCR_C_POINTER(0, 2); + INCR_C_POINTER(1, 2); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + 
DECLARE_RESULT_VECTOR(1, 0); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, 1); + INCR_C_POINTER(1, 1); + } + + UPDATE_A_POINTER(v_size2); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < n2; j += 2) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + UPDATE_B_POINTER(2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + 
VECTOR_STORE(pg_true, 0, 1); + INCR_C_POINTER(0, 2); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(v_size); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < n2; j += 2) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + UPDATE_B_POINTER(2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + INCR_C_POINTER(0, 2); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + 
DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(0); + RESET_B_POINTER(); + UPDATE_C_POINTER(0); + } + + if (pack_a) + free(packed_a); + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/dgemm_small_kernel_tt_sve.c b/kernel/arm64/dgemm_small_kernel_tt_sve.c new file mode 100644 index 00000000..efe11a9f --- /dev/null +++ b/kernel/arm64/dgemm_small_kernel_tt_sve.c @@ -0,0 +1,564 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; +#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + scale; +#define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; +#define UPDATE_C_POINTER(scale) c_offset += scale; +#define C_ELEMENT(m, n) \ + *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] + +// #undef C_ELEMENT +// #define 
C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] +#define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR2(m, n) \ + float64x2_t result##m##n = vdupq_n_f64(0.0); +#define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; +#define BROADCAST_LOAD_A2(m, offset_k) \ + float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define VECTOR_LOAD_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); +#define GATHER_LOAD_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); +#define VECTOR_UNPACK_B2(n, offset_k) \ + float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); +#define VECTOR_PACK_B2(n, offset_k) \ + vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); +#define PACK_B0(n, offset_k) \ + PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); +#define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ + result##m##n = \ + vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define VECTOR_STORE2(m, n) \ + vst1q_f64(&C_ELEMENT(m, n), vmulq_f64(result##m##n, vdupq_n_f64(alpha))); +#define STORE(m, n) C_ELEMENT(m, n) = alpha * result##m##n; +#else +#define VECTOR_STORE2(m, n) \ + result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ + result##m##n = \ + vfmaq_f64(result##m##n, vld1q_f64(&C_ELEMENT(m, n)), vdupq_n_f64(beta)); \ + vst1q_f64(&C_ELEMENT(m, n), result##m##n); +#define STORE(m, n) \ + C_ELEMENT(m, n) = C_ELEMENT(m, n) * beta + alpha * result##m##n; +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) 
svfloat64_t result##m##n = svdup_f64(0.0); +#define BROADCAST_LOAD_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define QUADWORD_LOAD_B(n, offset_k) \ + svfloat64_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); +#define GATHER_LOAD_A(pg, m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = \ + svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); +#define PACK_A(m, offset_k) \ + svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define VECTOR_PACK_A(m, offset_k) \ + svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); +#define QUADWORD_PACK_A(m, offset_k) \ + svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define UNPACK_VECTOR_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = \ + svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); +#define UNPACK_BROADCAST_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = svdup_f64(PACK_ELEMENT_K(m, offset_k)); +#define UNPACK_QUADWORD_A(m, offset_k) \ + svfloat64_t a##s##m##_k##offset_k = \ + svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, 
result##m##n); +#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntd(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b64(); + const svbool_t pg_quad = svwhilelt_b64(0, 2); + const svbool_t pg_first = svwhilelt_b64(0, 1); + const svfloat64_t alpha_vec = svdup_f64(alpha); +#ifndef B0 + const svfloat64_t beta_vec = svdup_f64(beta); +#endif + const svuint64_t lda_vec = svindex_u64(0LL, lda); + + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + const BLASLONG n4 = N & -4; + const BLASLONG n2 = N & -2; + + const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + FLOAT* packed_a = + (pack_a) ? 
packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; + + FLOAT* a_offset = A; + FLOAT* b_offset = B; + FLOAT* c_offset = C; + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, v_size); + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_a != NULL)) { + if (j == 0) { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + VECTOR_PACK_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + VECTOR_PACK_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); + } + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + 
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, 4); + INCR_C_POINTER(1, 4); + } + for (; j < n2; j += 2) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + UPDATE_B_POINTER(2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + INCR_C_POINTER(0, 2); + INCR_C_POINTER(1, 2); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + if 
(LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, 1); + INCR_C_POINTER(1, 1); + } + + UPDATE_A_POINTER(v_size2); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < n2; j += 2) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + UPDATE_B_POINTER(2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + INCR_C_POINTER(0, 2); + } + for (; j < N; j++) { + + 
CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(v_size); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + QUADWORD_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < n2; j += 2) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + UPDATE_B_POINTER(2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + INCR_C_POINTER(0, 2); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + 
UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(0); + RESET_B_POINTER(); + UPDATE_C_POINTER(0); + } + + if (pack_a) + free(packed_a); + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 16f4cd53..bc997521 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -108,7 +108,12 @@ dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) [N_] "r" (n), [X_] "r" (x), [Y_] "r" (y) - :); + : "cc", + "memory", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "d1", + "z0", "z1" + ); return ret; } diff --git a/kernel/arm64/gemm_small_kernel_permit_sve.c b/kernel/arm64/gemm_small_kernel_permit_sve.c new file mode 100644 index 00000000..3d425624 --- /dev/null +++ b/kernel/arm64/gemm_small_kernel_permit_sve.c @@ -0,0 +1,43 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + BLASLONG MNK = M * N * K; + +#if defined(DOUBLE) // dgemm + if (MNK <= 64*64*64) + return 1; +#else // sgemm + if (MNK <= 64*64*64) + return 1; +#endif + + return 0; +} diff --git a/kernel/arm64/gemv_n_sve.c b/kernel/arm64/gemv_n_sve.c new file mode 100644 index 00000000..29505556 --- /dev/null +++ b/kernel/arm64/gemv_n_sve.c @@ -0,0 +1,92 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + if (inc_y == 1) { + uint64_t sve_size = SV_COUNT(); + for (j = 0; j < n; j++) { + SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); + i = 0; + svbool_t pg = SV_WHILE(i, m); + while (svptest_any(SV_TRUE(), pg)) { + SV_TYPE a_vec = svld1(pg, a_ptr + i); + SV_TYPE y_vec = svld1(pg, y + i); + y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); + svst1(pg, y + i, y_vec); + i += sve_size; + pg = SV_WHILE(i, m); + } + a_ptr += lda; + ix += inc_x; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = alpha * x[ix]; + iy = 0; + for (i = 0; i < 
m; i++) { + y[iy] += temp * a_ptr[i]; + iy += inc_y; + } + a_ptr += lda; + ix += inc_x; + } + return (0); +} diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S index b04367ab..a98eef49 100644 --- a/kernel/arm64/gemv_t.S +++ b/kernel/arm64/gemv_t.S @@ -1,5 +1,5 @@ /******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project +Copyright (c) 2015, 2024 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -170,39 +170,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F32_FINALIZE #if !defined(DOUBLE) - fadd v1.4s, v1.4s, v2.4s + // F8 only has 2 accumulators + // so add into those pairs fadd v1.4s, v1.4s, v3.4s - fadd v1.4s, v1.4s, v4.4s -#else - fadd v1.2d, v1.2d, v2.2d - fadd v1.2d, v1.2d, v3.2d - fadd v1.2d, v1.2d, v4.2d + fadd v2.4s, v2.4s, v4.4s #endif .endm -.macro KERNEL_F4 +.macro KERNEL_F8 #if !defined(DOUBLE) - ld1 {v2.4s}, [A_PTR], #16 - ld1 {v3.4s}, [X_PTR], #16 - fmla v1.4s, v2.4s, v3.4s -#else - ld1 {v2.2d}, [A_PTR], #16 - ld1 {v3.2d}, [X_PTR], #16 - fmla v1.2d, v2.2d, v3.2d - - ld1 {v4.2d}, [A_PTR], #16 - ld1 {v5.2d}, [X_PTR], #16 - fmla v1.2d, v4.2d, v5.2d + ld1 {v13.4s, v14.4s}, [A_PTR], #32 + ld1 {v17.4s, v18.4s}, [X_PTR], #32 + fmla v1.4s, v13.4s, v17.4s + fmla v2.4s, v14.4s, v18.4s +#else + ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 + ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 + fmla v1.2d, v13.2d, v17.2d + fmla v2.2d, v14.2d, v18.2d + fmla v3.2d, v15.2d, v19.2d + fmla v4.2d, v16.2d, v20.2d #endif .endm -.macro KERNEL_F4_FINALIZE +.macro KERNEL_F8_FINALIZE #if !defined(DOUBLE) - ext v2.16b, v1.16b, v1.16b, #8 + // Take the top two elements of v1 and + // put them into the first two lanes of v3 + ext v3.16b, v1.16b, v1.16b, #8 + fadd v1.2s, v1.2s, v3.2s + ext v4.16b, v2.16b, v2.16b, #8 + fadd 
v2.2s, v2.2s, v4.2s + // Final pair fadd v1.2s, v1.2s, v2.2s faddp TEMP, v1.2s #else faddp TEMP, v1.2d + faddp TEMP1, v2.2d + faddp TEMP2, v3.2d + faddp TEMP3, v4.2d + fadd TEMP, TEMP, TEMP1 + fadd TEMP2, TEMP2, TEMP3 + fadd TEMP, TEMP, TEMP2 #endif .endm @@ -258,7 +267,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. asr I, M, #5 cmp I, xzr - beq .Lgemv_t_kernel_F4 + beq .Lgemv_t_kernel_F8 .Lgemv_t_kernel_F320: @@ -269,24 +278,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL_F32_FINALIZE -.Lgemv_t_kernel_F4: +.Lgemv_t_kernel_F8: ands I, M, #31 - asr I, I, #2 + asr I, I, #3 cmp I, xzr beq .Lgemv_t_kernel_F1 -.Lgemv_t_kernel_F40: +.Lgemv_t_kernel_F80: - KERNEL_F4 + KERNEL_F8 subs I, I, #1 - bne .Lgemv_t_kernel_F40 + bne .Lgemv_t_kernel_F80 .Lgemv_t_kernel_F1: - KERNEL_F4_FINALIZE + KERNEL_F8_FINALIZE - ands I, M, #3 + ands I, M, #7 ble .Lgemv_t_kernel_F_END .Lgemv_t_kernel_F10: diff --git a/kernel/arm64/gemv_t_sve.c b/kernel/arm64/gemv_t_sve.c new file mode 100644 index 00000000..183d9c3d --- /dev/null +++ b/kernel/arm64/gemv_t_sve.c @@ -0,0 +1,120 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + iy = 0; + a_ptr = a; + + if (inc_x == 1) { + svbool_t pg_true = SV_TRUE(); + uint64_t sve_size = SV_COUNT(); + uint64_t sve_size2 = sve_size * 2; + BLASLONG m1 = m & -sve_size; + BLASLONG m2 = m & -sve_size2; + + for (j = 0; j < n; j++) { + BLASLONG i = 0; + + SV_TYPE temp_vec_v2_0 = SV_DUP(0.0); + SV_TYPE temp_vec_v2_1 = SV_DUP(0.0); + for (; i < m2; i += sve_size2) { + SV_TYPE a_vec0 = svld1(pg_true, a_ptr + i); + SV_TYPE x_vec0 = svld1(pg_true, x + i); + SV_TYPE a_vec1 = svld1(pg_true, a_ptr + i + sve_size); + SV_TYPE x_vec1 = svld1(pg_true, x + i 
+ sve_size); + temp_vec_v2_0 = svmla_m(pg_true, temp_vec_v2_0, a_vec0, x_vec0); + temp_vec_v2_1 = svmla_m(pg_true, temp_vec_v2_1, a_vec1, x_vec1); + } + + SV_TYPE temp_vec_v1 = SV_DUP(0.0); + for (; i < m1; i += sve_size) { + SV_TYPE a_vec0 = svld1(pg_true, a_ptr + i); + SV_TYPE x_vec0 = svld1(pg_true, x + i); + temp_vec_v1 = svmla_m(pg_true, temp_vec_v1, a_vec0, x_vec0); + } + + SV_TYPE temp_vec = SV_DUP(0.0); + for (; i < m; i += sve_size) { + svbool_t pg = SV_WHILE(i, m); + SV_TYPE a_vec = svld1(pg, a_ptr + i); + SV_TYPE x_vec = svld1(pg, x + i); + temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec); + } + + y[iy] += alpha * ( + (svaddv(SV_TRUE(), temp_vec_v2_0) + svaddv(SV_TRUE(), temp_vec)) + + (svaddv(SV_TRUE(), temp_vec_v2_1) + svaddv(SV_TRUE(), temp_vec_v1)) + ); + + iy += inc_y; + a_ptr += lda; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = 0.0; + ix = 0; + for (i = 0; i < m; i++) { + temp += a_ptr[i] * x[ix]; + ix += inc_x; + } + y[iy] += alpha * temp; + iy += inc_y; + a_ptr += lda; + } + return (0); +} diff --git a/kernel/arm64/scal.S b/kernel/arm64/scal.S index 09c41cda..33400b63 100644 --- a/kernel/arm64/scal.S +++ b/kernel/arm64/scal.S @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define X_COPY x5 /* X vector address */ #define INC_X x4 /* X stride */ #define I x1 /* loop variable */ - +#define FLAG x9 /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -168,9 +168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, xzr ble .Lscal_kernel_L999 + ldr FLAG, [sp] + cmp FLAG, #1 + beq .Lscal_kernel_nansafe + fcmp DA, #0.0 beq .Lscal_kernel_zero +.Lscal_kernel_nansafe: cmp INC_X, #1 bne .Lscal_kernel_S_BEGIN diff --git a/kernel/arm64/sgemm_small_kernel_nn_sve.c b/kernel/arm64/sgemm_small_kernel_nn_sve.c new file mode 100644 index 00000000..8ea9cf5a --- /dev/null +++ b/kernel/arm64/sgemm_small_kernel_nn_sve.c @@ -0,0 +1,687 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; +#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; +#define INCR_C_POINTER(m, incr) // c_offset ## m += incr; +#define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; +#define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) + +// #undef C_ELEMENT +// #define C_ELEMENT(m, n) 
C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] +#define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR4(m, n) \ + float32x4_t result##m##n = vdupq_n_f32(0.0); +#define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; +#define BROADCAST_LOAD_A4(m, offset_k) \ + float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define VECTOR_LOAD_B_K4(n, offset_k) \ + float32x4_t b##k##n##_k##offset_k = vld1q_f32(&B_ELEMENT_K(n, offset_k)); +#define TRANSPOSE_B4_K4( \ + n0, n1, n2, n3, offset_k0, offset_k1, offset_k2, offset_k3) \ + float32x4_t b##t##n0##_k##offset_k0 = \ + vzip1q_f32(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ + float32x4_t b##t##n0##_k##offset_k1 = \ + vzip2q_f32(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ + float32x4_t b##t##n0##_k##offset_k2 = \ + vzip1q_f32(b##k##n2##_k##offset_k0, b##k##n3##_k##offset_k0); \ + float32x4_t b##t##n0##_k##offset_k3 = \ + vzip2q_f32(b##k##n2##_k##offset_k0, b##k##n3##_k##offset_k0); \ + float32x4_t b##n0##_k##offset_k0 = vreinterpretq_f32_f64( \ + vzip1q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k0), \ + vreinterpretq_f64_f32(b##t##n0##_k##offset_k2))); \ + float32x4_t b##n0##_k##offset_k1 = vreinterpretq_f32_f64( \ + vzip2q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k0), \ + vreinterpretq_f64_f32(b##t##n0##_k##offset_k2))); \ + float32x4_t b##n0##_k##offset_k2 = vreinterpretq_f32_f64( \ + vzip1q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k1), \ + vreinterpretq_f64_f32(b##t##n0##_k##offset_k3))); \ + float32x4_t b##n0##_k##offset_k3 = vreinterpretq_f32_f64( \ + vzip2q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k1), \ + vreinterpretq_f64_f32(b##t##n0##_k##offset_k3))); + +#define SCALE_B4_K4(n0, offset_k0, offset_k1, offset_k2, offset_k3) \ + svfloat32_t b##s##n0##_k##offset_k0 = 
svdup_neonq_f32(b##n0##_k##offset_k0); \ + svfloat32_t b##s##n0##_k##offset_k1 = svdup_neonq_f32(b##n0##_k##offset_k1); \ + svfloat32_t b##s##n0##_k##offset_k2 = svdup_neonq_f32(b##n0##_k##offset_k2); \ + svfloat32_t b##s##n0##_k##offset_k3 = svdup_neonq_f32(b##n0##_k##offset_k3); +#define GATHER_LOAD_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); +#define VECTOR_UNPACK_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); +#define VECTOR_PACK_B4(n, offset_k) \ + vst1q_f32(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); +#define PACK_B0(n, offset_k) \ + PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); +#define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ + result##m##n = \ + vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define SCATTER_STORE4(m, n) \ + result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ + C_ELEMENT(m, n + 0) = vgetq_lane_f32(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = vgetq_lane_f32(result##m##n, 1); \ + C_ELEMENT(m, n + 2) = vgetq_lane_f32(result##m##n, 2); \ + C_ELEMENT(m, n + 3) = vgetq_lane_f32(result##m##n, 3); +#else +#define SCATTER_STORE4(m, n) \ + result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ + C_ELEMENT(m, n + 0) = \ + C_ELEMENT(m, n + 0) * beta + vgetq_lane_f32(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = \ + C_ELEMENT(m, n + 1) * beta + vgetq_lane_f32(result##m##n, 1); \ + C_ELEMENT(m, n + 2) = \ + C_ELEMENT(m, n + 2) * beta + vgetq_lane_f32(result##m##n, 2); \ + C_ELEMENT(m, n + 
3) = \ + C_ELEMENT(m, n + 3) * beta + vgetq_lane_f32(result##m##n, 3); +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); +#define BROADCAST_LOAD_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define QUADWORD_LOAD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); +#define PACK_B(n, offset_k) \ + svst1(pg_first, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); +#define VECTOR_PACK_B(n, offset_k) \ + svst1(pg_true, &PACK_ELEMENT_K(n* v_size, offset_k), b##s##n##_k##offset_k); +#define QUADWORD_PACK_B(n, offset_k) \ + svst1(pg_quad, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); +#define UNPACK_VECTOR_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = \ + svld1(pg_true, &PACK_ELEMENT_K(n * v_size, offset_k)); +#define UNPACK_BROADCAST_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = svdup_f32(PACK_ELEMENT_K(n, offset_k)); +#define UNPACK_QUADWORD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &PACK_ELEMENT_K(n, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); 
+#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b32(); + const svbool_t pg_quad = svwhilelt_b32(0, 4); + const svbool_t pg_first = svwhilelt_b32(0, 1); + const svfloat32_t alpha_vec = svdup_f32(alpha); +#ifndef B0 + const svfloat32_t beta_vec = svdup_f32(beta); +#endif + const BLASLONG n4 = N & -4; + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + const BLASLONG k4 = K & -4; + + const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + FLOAT* packed_b = + (pack_b) ? 
packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; + + FLOAT* b_offset = B; + FLOAT* a_offset = A; + FLOAT* c_offset = C; + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, 1); + CREATE_C_POINTER(2, 2); + CREATE_C_POINTER(3, 3); + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_b != NULL)) { + if (i == 0) { + for (; k < k4; k += 4) { + + VECTOR_LOAD_B_K4(0, 0); + VECTOR_LOAD_B_K4(1, 0); + VECTOR_LOAD_B_K4(2, 0); + VECTOR_LOAD_B_K4(3, 0); + TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); + SCALE_B4_K4(0, 0, 1, 2, 3); + VECTOR_PACK_B4(0, 0); + VECTOR_PACK_B4(0, 1); + VECTOR_PACK_B4(0, 2); + VECTOR_PACK_B4(0, 3); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); + 
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + PACK_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + PACK_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + PACK_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + PACK_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } else { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 
2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } + } else { + for (; k < k4; k += 4) { + + VECTOR_LOAD_B_K4(0, 0); + VECTOR_LOAD_B_K4(1, 0); + VECTOR_LOAD_B_K4(2, 0); + VECTOR_LOAD_B_K4(3, 0); + TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); + SCALE_B4_K4(0, 0, 1, 2, 3); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); + 
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + INCR_C_POINTER(2, v_size2); + INCR_C_POINTER(3, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } else { + for (; k < k4; k += 4) { + + VECTOR_LOAD_B_K4(0, 0); + VECTOR_LOAD_B_K4(1, 0); + VECTOR_LOAD_B_K4(2, 0); + VECTOR_LOAD_B_K4(3, 0); + TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); + SCALE_B4_K4(0, 0, 1, 2, 3); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + 
VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + INCR_C_POINTER(0, v_size); + INCR_C_POINTER(1, v_size); + INCR_C_POINTER(2, v_size); + INCR_C_POINTER(3, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } else { + for (; k < k4; k += 4) { + + VECTOR_LOAD_B_K4(0, 0); + VECTOR_LOAD_B_K4(1, 0); + VECTOR_LOAD_B_K4(2, 0); + 
VECTOR_LOAD_B_K4(3, 0); + TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); + SCALE_B4_K4(0, 0, 1, 2, 3); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_tail, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_tail, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_tail, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); + } + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 0); + INCR_C_POINTER(1, 0); + INCR_C_POINTER(2, 0); + INCR_C_POINTER(3, 0); + } + + UPDATE_B_POINTER(4); + RESET_A_POINTER(); + UPDATE_C_POINTER(4); + } + for (; j < N; j++) { + + CREATE_C_POINTER(0, 0); + CREATE_B_POINTER(0, 0); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + 
VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + INCR_C_POINTER(0, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 0); + } + + UPDATE_B_POINTER(1); + RESET_A_POINTER(); + UPDATE_C_POINTER(1); + } + + if (pack_b) + free(packed_b); + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/sgemm_small_kernel_nt_sve.c b/kernel/arm64/sgemm_small_kernel_nt_sve.c new file mode 100644 index 00000000..ac7e067c --- /dev/null +++ b/kernel/arm64/sgemm_small_kernel_nt_sve.c @@ -0,0 +1,483 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; 
+#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; +#define INCR_C_POINTER(m, incr) // c_offset ## m += incr; +#define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; +#define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) + +// #undef C_ELEMENT +// #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] +#define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR4(m, n) \ + float32x4_t result##m##n = vdupq_n_f32(0.0); +#define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; +#define BROADCAST_LOAD_A4(m, offset_k) \ + float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define VECTOR_LOAD_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vld1q_f32(&B_ELEMENT_K(n, offset_k)); +#define GATHER_LOAD_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); +#define VECTOR_UNPACK_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); +#define VECTOR_PACK_B4(n, offset_k) \ + vst1q_f32(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); +#define PACK_B0(n, offset_k) \ + PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); +#define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ + result##m##n = \ + vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + 
a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define SCATTER_STORE4(m, n) \ + result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ + C_ELEMENT(m, n + 0) = vgetq_lane_f32(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = vgetq_lane_f32(result##m##n, 1); \ + C_ELEMENT(m, n + 2) = vgetq_lane_f32(result##m##n, 2); \ + C_ELEMENT(m, n + 3) = vgetq_lane_f32(result##m##n, 3); +#else +#define SCATTER_STORE4(m, n) \ + result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ + C_ELEMENT(m, n + 0) = \ + C_ELEMENT(m, n + 0) * beta + vgetq_lane_f32(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = \ + C_ELEMENT(m, n + 1) * beta + vgetq_lane_f32(result##m##n, 1); \ + C_ELEMENT(m, n + 2) = \ + C_ELEMENT(m, n + 2) * beta + vgetq_lane_f32(result##m##n, 2); \ + C_ELEMENT(m, n + 3) = \ + C_ELEMENT(m, n + 3) * beta + vgetq_lane_f32(result##m##n, 3); +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); +#define BROADCAST_LOAD_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define QUADWORD_LOAD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); +#define PACK_B(n, offset_k) \ + svst1(pg_first, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); +#define VECTOR_PACK_B(n, offset_k) \ + svst1(pg_true, &PACK_ELEMENT_K(n* v_size, offset_k), b##s##n##_k##offset_k); +#define QUADWORD_PACK_B(n, offset_k) \ + svst1(pg_quad, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); +#define UNPACK_VECTOR_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = \ + svld1(pg_true, &PACK_ELEMENT_K(n * v_size, offset_k)); +#define UNPACK_BROADCAST_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = 
svdup_f32(PACK_ELEMENT_K(n, offset_k)); +#define UNPACK_QUADWORD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &PACK_ELEMENT_K(n, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b32(); + const svbool_t pg_quad = svwhilelt_b32(0, 4); + const svbool_t pg_first = 
svwhilelt_b32(0, 1); + const svfloat32_t alpha_vec = svdup_f32(alpha); +#ifndef B0 + const svfloat32_t beta_vec = svdup_f32(beta); +#endif + const BLASLONG n4 = N & -4; + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + + const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + FLOAT* packed_b = + (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; + + FLOAT* b_offset = B; + FLOAT* a_offset = A; + FLOAT* c_offset = C; + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, 1); + CREATE_C_POINTER(2, 2); + CREATE_C_POINTER(3, 3); + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_b != NULL)) { + if (i == 0) { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + QUADWORD_PACK_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } else { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 
0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + INCR_C_POINTER(2, v_size2); + INCR_C_POINTER(3, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); 
+ VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + INCR_C_POINTER(0, v_size); + INCR_C_POINTER(1, v_size); + INCR_C_POINTER(2, v_size); + INCR_C_POINTER(3, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 0); + INCR_C_POINTER(1, 0); + INCR_C_POINTER(2, 0); + INCR_C_POINTER(3, 0); + } + + UPDATE_B_POINTER(4); + RESET_A_POINTER(); + UPDATE_C_POINTER(4); + } + for (; j < N; j++) { + + CREATE_C_POINTER(0, 0); + CREATE_B_POINTER(0, 0); + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + 
INCR_C_POINTER(0, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + INCR_C_POINTER(0, v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(0); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 0); + } + + UPDATE_B_POINTER(1); + RESET_A_POINTER(); + UPDATE_C_POINTER(1); + } + + if (pack_b) + free(packed_b); + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/sgemm_small_kernel_tn_sve.c b/kernel/arm64/sgemm_small_kernel_tn_sve.c new file mode 100644 index 00000000..11464095 --- /dev/null +++ b/kernel/arm64/sgemm_small_kernel_tn_sve.c @@ -0,0 +1,719 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; +#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + 
scale; +#define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; +#define UPDATE_C_POINTER(scale) c_offset += scale; +#define C_ELEMENT(m, n) \ + *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] + +// #undef C_ELEMENT +// #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] +#define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR4(m, n) \ + float32x4_t result##m##n = vdupq_n_f32(0.0); +#define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; +#define BROADCAST_LOAD_A4(m, offset_k) \ + float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define GATHER_LOAD_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); +#define VECTOR_UNPACK_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); +#define PACK_B0(n, offset_k) \ + PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); +#define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ + result##m##n = \ + vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define SCATTER_STORE4(m, n) \ + result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ + C_ELEMENT(m, n + 0) = vgetq_lane_f32(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = vgetq_lane_f32(result##m##n, 1); \ + C_ELEMENT(m, n + 2) = vgetq_lane_f32(result##m##n, 2); \ + C_ELEMENT(m, n + 3) = vgetq_lane_f32(result##m##n, 3); 
+#else +#define SCATTER_STORE4(m, n) \ + result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ + C_ELEMENT(m, n + 0) = \ + C_ELEMENT(m, n + 0) * beta + vgetq_lane_f32(result##m##n, 0); \ + C_ELEMENT(m, n + 1) = \ + C_ELEMENT(m, n + 1) * beta + vgetq_lane_f32(result##m##n, 1); \ + C_ELEMENT(m, n + 2) = \ + C_ELEMENT(m, n + 2) * beta + vgetq_lane_f32(result##m##n, 2); \ + C_ELEMENT(m, n + 3) = \ + C_ELEMENT(m, n + 3) * beta + vgetq_lane_f32(result##m##n, 3); +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); +#define BROADCAST_LOAD_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define GATHER_LOAD_A(pg, m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = \ + svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); +#define PACK_A(m, offset_k) \ + svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define VECTOR_PACK_A(m, offset_k) \ + svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); +#define QUADWORD_PACK_A(m, offset_k) \ + svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define UNPACK_VECTOR_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = \ + svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); +#define UNPACK_BROADCAST_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svdup_f32(PACK_ELEMENT_K(m, offset_k)); +#define UNPACK_QUADWORD_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = \ + svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + 
result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b32(); + const svbool_t pg_quad = svwhilelt_b32(0, 4); + const svbool_t pg_first = svwhilelt_b32(0, 1); + const svfloat32_t alpha_vec = svdup_f32(alpha); +#ifndef B0 + const svfloat32_t beta_vec = svdup_f32(beta); +#endif + const svuint32_t lda_vec = svindex_u32(0LL, lda); + + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + const BLASLONG n8 = N & -8; + const BLASLONG n4 = N & -4; + + const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 
1 : 0; + FLOAT* packed_a = + (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; + + FLOAT* a_offset = A; + FLOAT* b_offset = B; + FLOAT* c_offset = C; + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, v_size); + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + + BLASLONG j = 0; + for (; j < n8; j += 8) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + DECLARE_RESULT_VECTOR(1, 4); + DECLARE_RESULT_VECTOR(1, 5); + DECLARE_RESULT_VECTOR(1, 6); + DECLARE_RESULT_VECTOR(1, 7); + + if (LIKELY(packed_a != NULL)) { + if (j == 0) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + VECTOR_PACK_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + VECTOR_PACK_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); + 
BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); + } + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); + BROADCAST_LOAD_B(6, 0); + 
UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 0, 4); + VECTOR_STORE(pg_true, 0, 5); + VECTOR_STORE(pg_true, 0, 6); + VECTOR_STORE(pg_true, 0, 7); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + VECTOR_STORE(pg_true, 1, 4); + VECTOR_STORE(pg_true, 1, 5); + VECTOR_STORE(pg_true, 1, 6); + VECTOR_STORE(pg_true, 1, 7); + INCR_C_POINTER(0, 8); + INCR_C_POINTER(1, 8); + } + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 
0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, 4); + INCR_C_POINTER(1, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, 1); + INCR_C_POINTER(1, 1); + } + + UPDATE_A_POINTER(v_size2); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n8; j += 8) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + 
DECLARE_RESULT_VECTOR(0, 7); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 0, 4); + VECTOR_STORE(pg_true, 0, 5); + VECTOR_STORE(pg_true, 0, 6); + VECTOR_STORE(pg_true, 0, 7); + INCR_C_POINTER(0, 8); + } + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + 
INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(v_size); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n8; j += 8) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 7, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + VECTOR_STORE(pg_tail, 0, 4); + VECTOR_STORE(pg_tail, 0, 5); + VECTOR_STORE(pg_tail, 0, 6); + VECTOR_STORE(pg_tail, 0, 7); + INCR_C_POINTER(0, 8); + } + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + 
for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(0); + RESET_B_POINTER(); + UPDATE_C_POINTER(0); + } + + if (pack_a) + free(packed_a); + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/sgemm_small_kernel_tt_sve.c b/kernel/arm64/sgemm_small_kernel_tt_sve.c new file mode 100644 index 00000000..731c9861 --- /dev/null +++ b/kernel/arm64/sgemm_small_kernel_tt_sve.c @@ -0,0 +1,678 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include +#include +#if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ + __has_include() +#include +#else +#define svdup_neonq_f32(fixed_reg) \ + ({ \ + svfloat32_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#define svdup_neonq_f64(fixed_reg) \ + ({ \ + svfloat64_t scalable_reg; \ + asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ + scalable_reg; \ + }) +#endif + +#define RESET_A_POINTER() a_offset = A; + +#define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; +#define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; +#define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) +#define A_ELEMENT(m) A_ELEMENT_K(m, 0) + +#define RESET_B_POINTER() b_offset = B; + +#define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; +#define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; +#define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) +#define B_ELEMENT(n) B_ELEMENT_K(n, 0) + +#define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + scale; 
+#define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; +#define UPDATE_C_POINTER(scale) c_offset += scale; +#define C_ELEMENT(m, n) \ + *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] + +// #undef C_ELEMENT +// #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] + +#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] +#define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) + +// ASIMD +#define DECLARE_RESULT_VECTOR4(m, n) \ + float32x4_t result##m##n = vdupq_n_f32(0.0); +#define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; +#define BROADCAST_LOAD_A4(m, offset_k) \ + float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); +#define LOAD_A1(m, offset_k) \ + float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); +#define VECTOR_LOAD_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vld1q_f32(&B_ELEMENT_K(n, offset_k)); +#define GATHER_LOAD_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ + b##n##_k##offset_k = \ + vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); +#define VECTOR_UNPACK_B4(n, offset_k) \ + float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); +#define VECTOR_PACK_B4(n, offset_k) \ + vst1q_f32(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); +#define PACK_B0(n, offset_k) \ + PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); +#define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ + result##m##n = \ + vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); +#define UPDATE_RESULT(m, n, offset_k) \ + result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; +#ifdef B0 +#define VECTOR_STORE4(m, n) \ + vst1q_f32(&C_ELEMENT(m, n), vmulq_f32(result##m##n, vdupq_n_f32(alpha))); +#define STORE(m, n) 
C_ELEMENT(m, n) = alpha * result##m##n; +#else +#define VECTOR_STORE4(m, n) \ + result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ + result##m##n = \ + vfmaq_f32(result##m##n, vld1q_f32(&C_ELEMENT(m, n)), vdupq_n_f32(beta)); \ + vst1q_f32(&C_ELEMENT(m, n), result##m##n); +#define STORE(m, n) \ + C_ELEMENT(m, n) = C_ELEMENT(m, n) * beta + alpha * result##m##n; +#endif + +// SVE +#define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); +#define BROADCAST_LOAD_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); +#define BROADCAST_LOAD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); +#define VECTOR_LOAD_A(pg, m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); +#define QUADWORD_LOAD_B(n, offset_k) \ + svfloat32_t b##s##n##_k##offset_k = \ + svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); +#define GATHER_LOAD_A(pg, m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = \ + svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); +#define PACK_A(m, offset_k) \ + svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define VECTOR_PACK_A(m, offset_k) \ + svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); +#define QUADWORD_PACK_A(m, offset_k) \ + svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); +#define UNPACK_VECTOR_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = \ + svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); +#define UNPACK_BROADCAST_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = svdup_f32(PACK_ELEMENT_K(m, offset_k)); +#define UNPACK_QUADWORD_A(m, offset_k) \ + svfloat32_t a##s##m##_k##offset_k = \ + svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); +#define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ + result##m##n = \ + svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); +#define 
UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ + result##m##n = svmla_lane( \ + result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); +#ifdef B0 +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#else +#define VECTOR_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = \ + svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ + svst1(pg, &C_ELEMENT(m, n), result##m##n); +#define SCATTER_STORE(pg, m, n) \ + result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ + result##m##n = svmla_m(pg, \ + result##m##n, \ + svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ + beta_vec); \ + svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); +#endif + +#ifndef LIKELY +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif +#endif + +#ifdef B0 +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT* C, + BLASLONG ldc) +#else +int +CNAME(BLASLONG M, + BLASLONG N, + BLASLONG K, + IFLOAT* A, + BLASLONG lda, + FLOAT alpha, + IFLOAT* B, + BLASLONG ldb, + FLOAT beta, + FLOAT* C, + BLASLONG ldc) +#endif +{ + const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; + const svbool_t pg_true = svptrue_b32(); + const svbool_t pg_quad = svwhilelt_b32(0, 4); + const svbool_t pg_first = svwhilelt_b32(0, 1); + const svfloat32_t alpha_vec = svdup_f32(alpha); +#ifndef B0 + const svfloat32_t beta_vec = svdup_f32(beta); +#endif + const svuint32_t lda_vec = svindex_u32(0LL, lda); + + const BLASLONG v_m2 = M & -v_size2; + const BLASLONG v_m1 = M & -v_size; + const BLASLONG n8 = N & -8; + const BLASLONG n4 = N & -4; + + const 
int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + FLOAT* packed_a = + (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; + + FLOAT* a_offset = A; + FLOAT* b_offset = B; + FLOAT* c_offset = C; + + BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, v_size); + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + + BLASLONG j = 0; + for (; j < n8; j += 8) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + DECLARE_RESULT_VECTOR(1, 4); + DECLARE_RESULT_VECTOR(1, 5); + DECLARE_RESULT_VECTOR(1, 6); + DECLARE_RESULT_VECTOR(1, 7); + + if (LIKELY(packed_a != NULL)) { + if (j == 0) { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + VECTOR_PACK_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + QUADWORD_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); + GATHER_LOAD_A(pg_true, 1, 0); + VECTOR_PACK_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 
0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + QUADWORD_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); + } + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + QUADWORD_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); + 
UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 0, 4); + VECTOR_STORE(pg_true, 0, 5); + VECTOR_STORE(pg_true, 0, 6); + VECTOR_STORE(pg_true, 0, 7); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + VECTOR_STORE(pg_true, 1, 4); + VECTOR_STORE(pg_true, 1, 5); + VECTOR_STORE(pg_true, 1, 6); + VECTOR_STORE(pg_true, 1, 7); + INCR_C_POINTER(0, 8); + INCR_C_POINTER(1, 8); + } + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 
2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, 4); + INCR_C_POINTER(1, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, 1); + INCR_C_POINTER(1, 1); + } + + UPDATE_A_POINTER(v_size2); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n8; j += 8) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + 
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + QUADWORD_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 0, 4); + VECTOR_STORE(pg_true, 0, 5); + VECTOR_STORE(pg_true, 0, 6); + VECTOR_STORE(pg_true, 0, 7); + INCR_C_POINTER(0, 8); + } + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(v_size); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size); + } + for (; i < M; i += v_size) { + const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); + CREATE_C_POINTER(0, 0); + CREATE_A_POINTER(0, 0); + + BLASLONG j = 0; + for (; j < n8; j += 8) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 
2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + QUADWORD_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + VECTOR_STORE(pg_tail, 0, 4); + VECTOR_STORE(pg_tail, 0, 5); + VECTOR_STORE(pg_tail, 0, 6); + VECTOR_STORE(pg_tail, 0, 7); + INCR_C_POINTER(0, 8); + } + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + INCR_C_POINTER(0, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + 
UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + INCR_C_POINTER(0, 1); + } + + UPDATE_A_POINTER(0); + RESET_B_POINTER(); + UPDATE_C_POINTER(0); + } + + if (pack_a) + free(packed_a); + + return 0; +} \ No newline at end of file diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 6f65e5cf..d4839241 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -292,7 +292,10 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31" ); cf=OPENBLAS_MAKE_COMPLEX_FLOAT(dotr, doti); diff --git a/kernel/generic/laswp_ncopy_6.c b/kernel/generic/laswp_ncopy_6.c new file mode 100644 index 00000000..85a17a09 --- /dev/null +++ b/kernel/generic/laswp_ncopy_6.c @@ -0,0 +1,276 @@ + +/*********************************************************************/ +/* Copyright 2009, 2010, 2024 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 4 + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip; + blasint *piv; + FLOAT *dx1, *dy1; + FLOAT *dx2, *dy2; + FLOAT *dx3, *dy3; + FLOAT *dx4, *dy4; + FLOAT *dx5, *dy5; + FLOAT *dx6, *dy6; + FLOAT atemp1, btemp1; + FLOAT atemp2, btemp2; + FLOAT atemp3, btemp3; + FLOAT atemp4, btemp4; + FLOAT atemp5, btemp5; + FLOAT atemp6, btemp6; + + a--; + ipiv += k1 - 1; + + if (n <= 0) return 0; + if (k1 > k2) return 0; + + j = (n / 6); + if (j > 0) { + do { + piv = ipiv; + i = k1; + + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + dx5 = a + i + lda * 4; + dy5 = a + ip + lda * 4; + dx6 = a + i + lda * 5; + dy6 = a + ip + lda * 5; + +#ifdef __GNUC__ + __builtin_prefetch(dx1 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx2 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx3 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx4 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx5 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx6 + PREFETCHSIZE, 0, 1); +#endif + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + atemp3 = *dx3; + btemp3 = *dy3; + atemp4 = *dx4; + btemp4 = *dy4; + + atemp5 = *dx5; + btemp5 = *dy5; + atemp6 = *dx6; + btemp6 = *dy6; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *dy5 = atemp5; + *dy6 = atemp6; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + *(buffer + 4) = btemp5; + *(buffer + 5) = btemp6; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + *(buffer + 4) = atemp5; + *(buffer + 5) = atemp6; + } + + buffer += 6; + + 
i++; + } while (i <= k2); + + a += 6 * lda; + j --; + } while (j > 0); + } + + if ((n % 6) & 4) { + piv = ipiv; + + ip = *piv; + piv ++; + + dx1 = a + k1; + dy1 = a + ip; + dx2 = a + k1 + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + k1 + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + k1 + lda * 3; + dy4 = a + ip + lda * 3; + + i = k1; + + do { + atemp1 = *dx1; + atemp2 = *dx2; + atemp3 = *dx3; + atemp4 = *dx4; + + btemp1 = *dy1; + btemp2 = *dy2; + btemp3 = *dy3; + btemp4 = *dy4; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + } + + ip = *piv; + piv ++; + + i++; + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + + buffer += 4; + + } while (i <= k2); + + a += 4 * lda; + } + + if ((n % 6) & 2) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda; + dy2 = a + ip + lda; + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + } + + buffer += 2; + + i++; + } while (i <= k2); + + a += 2 * lda; + } + + + if ((n % 6) & 1) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + atemp1 = *dx1; + btemp1 = *dy1; + + if (ip != i) { + *dy1 = atemp1; + *buffer = btemp1; + } else { + *buffer = atemp1; + } + + buffer ++; + + i++; + } while (i <= k2); + + // a += lda; + } + + return 0; +} \ No newline at end of file diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c index ca730e1e..3a3e2d5b 100644 
--- a/kernel/generic/symm_lcopy_6.c +++ b/kernel/generic/symm_lcopy_6.c @@ -41,98 +41,141 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, offset; - - FLOAT data01, data02, data03, data04; - FLOAT *ao1, *ao2, *ao3, *ao4; + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + if (offset > -5) ao6 += lda; else ao6 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + + b += 6; + + offset --; + i --; + } + + posX += 6; + js --; + } - js = (n >> 2); - while (js > 0){ + if ((n - n/6) & 4) { + offset = posX - posY; - offset = posX - posY; + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a 
+ posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; - if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; - if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; - if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; - if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - data03 = *(ao3 + 0); - data04 = *(ao4 + 0); + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; - if (offset > 0) ao1 += lda; else ao1 ++; - if (offset > -1) ao2 += lda; else ao2 ++; - if (offset > -2) ao3 += lda; else ao3 ++; - if (offset > -3) ao4 += lda; else ao4 ++; + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; + b += 4; - b += 4; + offset --; + i --; + } - offset --; - i --; + posX += 4; } - posX += 4; - js --; - } + if ((n - n/6) & 2) { - if (n & 2) { + offset = posX - posY; - offset = posX - posY; + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; - if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; - if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; 
else ao2 ++; - if (offset > 0) ao1 += lda; else ao1 ++; - if (offset > -1) ao2 += lda; else ao2 ++; + b[ 0] = data01; + b[ 1] = data02; - b[ 0] = data01; - b[ 1] = data02; + b += 2; - b += 2; + offset --; + i --; + } - offset --; - i --; + posX += 2; } - posX += 2; - } - - if (n & 1) { + if ((n - n/6) & 1) { - offset = posX - posY; + offset = posX - posY; - if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; - i = m; + i = m; - while (i > 0) { - data01 = *(ao1 + 0); + while (i > 0) { + data01 = *(ao1 + 0); - if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > 0) ao1 += lda; else ao1 ++; - b[ 0] = data01; + b[ 0] = data01; - b ++; + b ++; - offset --; - i --; + offset --; + i --; + } } - } - return 0; + return 0; } diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c index 6dbb861e..a83d937d 100644 --- a/kernel/generic/symm_ucopy_6.c +++ b/kernel/generic/symm_ucopy_6.c @@ -41,96 +41,140 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, offset; + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 
0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + + b += 6; + + offset --; + i --; + } + + posX += 6; + js --; + } - FLOAT data01, data02, data03, data04; - FLOAT *ao1, *ao2, *ao3, *ao4; + if ((n - n/6) & 4) { - js = (n >> 2); - while (js > 0){ + offset = posX - posY; - offset = posX - posY; + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; - if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; - if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; - if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; - if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - data03 = *(ao3 + 0); - data04 = *(ao4 + 0); + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; - if (offset > 0) ao1 ++; else ao1 += lda; - if (offset > -1) ao2 ++; else ao2 += lda; - if (offset > -2) ao3 ++; else ao3 += lda; - if 
(offset > -3) ao4 ++; else ao4 += lda; + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; + b += 4; - b += 4; + offset --; + i --; + } - offset --; - i --; + posX += 4; } - posX += 4; - js --; - } + if ((n - n/6) & 2) { + offset = posX - posY; - if (n & 2) { - offset = posX - posY; + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; - if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; - if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; - if (offset > 0) ao1 ++; else ao1 += lda; - if (offset > -1) ao2 ++; else ao2 += lda; + b[ 0] = data01; + b[ 1] = data02; - b[ 0] = data01; - b[ 1] = data02; + b += 2; - b += 2; + offset --; + i --; + } - offset --; - i --; + posX += 2; } - posX += 2; - } - - if (n & 1) { - offset = posX - posY; + if ((n - n/6) & 1) { + offset = posX - posY; - if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; - i = m; + i = m; - while (i > 0) { - data01 = *(ao1 + 0); + while (i > 0) { + data01 = *(ao1 + 0); - if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > 0) ao1 ++; else ao1 += lda; - b[ 0] = data01; + b[ 0] = data01; - b ++; + b ++; - offset --; - i --; + offset --; + i --; + } } - } - return 0; + return 0; } diff --git a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c index 0dcfb965..999f0d36 100644 --- a/kernel/generic/trmm_lncopy_6.c +++ 
b/kernel/generic/trmm_lncopy_6.c @@ -41,444 +41,510 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X; - - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *ao1, *ao2, *ao3, *ao4; - - js = (n >> 2); - - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + posY + (posX + 3) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - } - - i = (m >> 2); - if (i > 0) { - do { - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b[ 4] = data02; - b[ 5] = data06; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = data11; - b[11] = data15; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = data16; - - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - - } else - if (X < posY) { - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - b += 16; - - } else { + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX 
+ 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } + + i = (m / 6); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + + } else if (X < posY) { + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + b += 36; + + } else { #ifdef UNIT - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data12 = *(ao3 + 3); - - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = data02; - b[ 5] = ONE; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = ONE; - b[11] = ZERO; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = data02; - b[ 5] = data06; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = data11; - b[11] = ZERO; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = data16; + b[ 0] = *(ao1 + 0); #endif - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } - - X += 4; - i --; - } while (i > 0); - } - - i = (m & 3); - if (i) { - - if (X > posY) { - - if (m & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = 
*(ao2 + 0); - data04 = *(ao2 + 1); - data05 = *(ao3 + 0); - data06 = *(ao3 + 1); - data07 = *(ao4 + 0); - data08 = *(ao4 + 1); - - b[ 0] = data01; - b[ 1] = data03; - b[ 2] = data05; - b[ 3] = data07; - b[ 4] = data02; - b[ 5] = data04; - b[ 6] = data06; - b[ 7] = data08; - - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (m & 1) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - data03 = *(ao3 + 0); - data04 = *(ao4 + 0); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X < posY) { - if (m & 2) { - ao1 += 2 * lda; - ao2 += 2 * lda; - - b += 8; - } - - if (m & 1) { - ao1 += lda; - b += 4; - } - - } else { + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + + b[ 6] = *(ao1 + 1); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + + b[12] = *(ao1 + 2); + b[13] = *(ao2 + 2); +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(ao3 + 2); +#endif + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + + b[18] = *(ao1 + 3); + b[19] = *(ao2 + 3); + b[20] = *(ao3 + 3); +#ifdef UNIT + b[21] = ONE; +#else + b[21] = *(ao4 + 3); +#endif + b[22] = ZERO; + b[23] = ZERO; + + b[24] = *(ao1 + 4); + b[25] = *(ao2 + 4); + b[26] = *(ao3 + 4); + b[27] = *(ao4 + 4); +#ifdef UNIT + b[28] = ONE; +#else + b[28] = *(ao5 + 4); +#endif + b[29] = ZERO; + + b[30] = *(ao1 + 5); + b[31] = *(ao2 + 5); + b[32] = *(ao3 + 5); + b[33] = *(ao4 + 5); + b[34] = *(ao5 + 5); +#ifdef UNIT + b[35] = ONE; +#else + b[35] = *(ao6 + 5); +#endif + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + } + + X += 6; + i --; + } while (i > 0); + } + + i = (m % 6); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + 
ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + + } else if (X < posY) { + + b += 6 * i; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + + if (i >= 2) { + b[ 0] = *(ao1 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 3) { + b[ 0] = *(ao1 + 2); + b[ 1] = *(ao2 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 4) { + b[ 0] = *(ao1 + 3); + b[ 1] = *(ao2 + 3); + b[ 2] = *(ao3 + 3); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 5) { + b[ 0] = *(ao1 + 4); + b[ 1] = *(ao2 + 4); + b[ 2] = *(ao3 + 4); + b[ 3] = *(ao4 + 4); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = ZERO; + b += 6; + } + } + } + + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + if ((n % 6) & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 2; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(ao1 + 1); +#ifdef UNIT + b[ 5] 
= ONE; +#else + b[ 5] = *(ao2 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(ao1 + 2); + b[ 9] = *(ao2 + 2); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(ao3 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(ao1 + 3); + b[ 13] = *(ao2 + 3); + b[ 14] = *(ao3 + 3); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + X += 4; + i -= 2; + continue; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X < posY) { + /* ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; */ + b += 4 * i; + } else { #ifdef UNIT - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data15 = *(ao4 + 2); - } - - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = ONE; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ONE; - b[ 3] = data15; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data06 = *(ao2 + 1); - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data11 = *(ao3 + 2); - data15 = *(ao4 + 2); - } - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = data06; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = data11; - b[ 3] = data15; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - - if (n & 2){ - X = 
posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data02; - b[ 3] = data06; - - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X < posY) { - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } else { + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + + posY += 4; + } + + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { #ifdef UNIT - data02 = *(ao1 + 1); + data02 = *(ao1 + 1); - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = data02; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = data02; - b[ 3] = data06; + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; #endif - ao1 += 2; - ao2 += 2; - - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - b[ 0] = data01; - b[ 1] = data02; - - ao1 += 1; - ao2 += 1; - b += 2; - } else - if (X < posY) { - ao1 += lda; - b += 2; - } else { + ao1 += 2; + 
ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else if (X < posY) { + ao1 += lda; + b += 2; + } else { #ifdef UNIT - data05 = *(ao2 + 0); + data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = data05; + b[ 0] = ONE; + b[ 1] = data05; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); - b[ 0] = data01; - b[ 1] = data05; + b[ 0] = data01; + b[ 1] = data05; #endif - b += 2; - } - } - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - } - - i = m; - if (i > 0) { - do { - if (X > posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - b += 1; - ao1 += 1; - } else - if (X < posY) { - b += 1; - ao1 += lda; - } else { + b += 2; + } + } + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else if (X < posY) { + b += 1; + ao1 += lda; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - b += 1; - ao1 += 1; - } + b += 1; + ao1 += 1; + } - X ++; - i --; - } while (i > 0); - } + X ++; + i --; + } while (i > 0); + } - posY += 1; - } + posY += 1; + } - return 0; + return 0; } diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c index 66a7325b..7c224503 100644 --- a/kernel/generic/trmm_ltcopy_6.c +++ b/kernel/generic/trmm_ltcopy_6.c @@ -41,448 +41,511 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X; - - FLOAT data01, data02, 
data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *ao1, *ao2, *ao3, *ao4; - - js = (n >> 2); - - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + posY + (posX + 3) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - } - - i = (m >> 2); - if (i > 0) { - do { - if (X > posY) { - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - - } else - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = data11; - b[11] = data12; - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = data16; - - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - b += 16; - - } else { + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + 
ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } + + i = (m / 6); + if (i > 0) { + do { + if (X > posY) { + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + + } else if (X < posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + b += 6; + } + + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = *(ao2 + 2); + b[ 9] = *(ao2 + 3); + b[10] = *(ao2 + 4); + b[11] = *(ao2 + 5); + + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(ao3 + 2); +#endif + b[15] = *(ao3 + 3); + b[16] = *(ao3 + 4); + b[17] = *(ao3 + 5); + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; +#ifdef UNIT + b[21] = ONE; +#else + b[21] = *(ao4 + 3); +#endif + b[22] = *(ao4 + 4); + b[23] = *(ao4 + 5); + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; +#ifdef UNIT + b[28] = ONE; +#else + b[28] = *(ao5 + 4); +#endif + b[29] = *(ao5 + 5); + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; +#ifdef UNIT + b[35] = ONE; +#else + b[35] = *(ao6 + 5); +#endif + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + } + + X += 6; + i --; + } while (i > 0); + } + + i = (m % 6); + if (i) { + + if (X > posY) { + + b += 6 * i; + + } else if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + ao2 += lda; + ao3 += lda; + ao4 += lda; + ao5 
+= lda; + ao6 += lda; + b += 6; + } + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + b += 6; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); + b[ 4] = *(ao2 + 4); + b[ 5] = *(ao2 + 5); + b += 6; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2); +#endif + b[ 3] = *(ao3 + 3); + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); + b += 6; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = *(ao4 + 4); + b[ 5] = *(ao4 + 5); + b += 6; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = *(ao5 + 5); + b += 6; + } + } + } + + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + if ((n % 6) & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } else if (X < posY) { + + for (ii = 0; ii < 2; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + ao1 += lda; + b += 4; + } + + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] 
= *(ao2 + 1); +#endif + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); + b[ 8] = ZERO; + b[ 9] = ZERO; #ifdef UNIT - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data12 = *(ao3 + 3); - - b[ 0] = ONE; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - - b[ 4] = ZERO; - b[ 5] = ONE; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ONE; - b[11] = data12; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ONE; + b[ 10] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = ZERO; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = data11; - b[11] = data12; - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = data16; + b[ 10] = *(ao3 + 2); #endif - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } - - X += 4; - i --; - } while (i > 0); - } - - i = (m & 3); - if (i) { - - if (X > posY) { - - if (m & 2) { - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (m & 1) { - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X < posY) { - if (m & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - ao1 += 2 * lda; - ao2 += 2 * lda; - - b += 8; - } - - if (m & 1) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - 
b[ 3] = data04; - - ao1 += lda; - b += 4; - } - - } else { + b[ 11] = *(ao3 + 3); + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + X += 4; + i -= 2; + continue; + } + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + /* ao1 += i; + ao2 += i; + ao3 += i; + ao4 += i; */ + b += 4 * i; + } else if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + + // ao1 += lda; + // ao2 += lda; + // ao3 += lda; + // ao4 += lda; + b += 4; + } + } else { #ifdef UNIT - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - if (i >= 2) { - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - } - - if (i >= 3) { - data12 = *(ao3 + 3); - } - - b[ 0] = ONE; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = ONE; - b[ 2] = data07; - b[ 3] = data08; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ONE; - b[ 3] = data12; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - if (i >= 2) { - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - } - - if (i >= 3) { - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - } - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = data06; - b[ 2] = data07; - b[ 3] = data08; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = data11; - b[ 3] = data12; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - - if (n & 2){ - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } else { - ao1 = a + posX + (posY + 0) 
* lda; - ao2 = a + posX + (posY + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X > posY) { - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data05; - b[ 3] = data06; - - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } else { + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b += 4; + } + } + posY += 4; + } + + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { #ifdef UNIT - data02 = *(ao1 + 1); + data02 = *(ao1 + 1); - b[ 0] = ONE; - b[ 1] = data02; - b[ 2] = ZERO; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = ZERO; - b[ 3] = data06; + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; #endif - ao1 += 2; - ao2 += 2; - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X > posY) { - ao1 += 1; - ao2 += 1; - - b += 2; - } else - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - - b[ 0] = data01; - b[ 1] = data02; - ao1 += lda; - b += 2; - } else { + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + 
ao2 += 1; + + b += 2; + } else if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { #ifdef UNIT - data02 = *(ao1 + 1); + data02 = *(ao1 + 1); - b[ 0] = ONE; - b[ 1] = data02; + b[ 0] = ONE; + b[ 1] = data02; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); - b[ 0] = data01; - b[ 1] = data02; + b[ 0] = data01; + b[ 1] = data02; #endif - b += 2; - } - } - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - } - - i = m; - if (i > 0) { - do { - if (X > posY) { - b += 1; - ao1 += 1; - } else - if (X < posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - ao1 += lda; - b += 1; - } else { + b += 2; + } + } + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 1; + ao1 += 1; + } else if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += 1; - b += 1; - } + ao1 += 1; + b += 1; + } - X ++; - i --; - } while (i > 0); - } + X ++; + i --; + } while (i > 0); + } - posY += 1; - } + posY += 1; + } - return 0; + return 0; } diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c index 4878f3f5..9521cc72 100644 --- a/kernel/generic/trmm_uncopy_6.c +++ b/kernel/generic/trmm_uncopy_6.c @@ -41,745 +41,544 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X, mm; - - FLOAT data01, data02, data03, data04, data05, data06; - FLOAT data07, data08, data09, data10, data11, data12; - FLOAT data13, data14, data15, data16, 
data17, data18; - FLOAT data19, data20, data21, data22, data23, data24; - FLOAT data25, data26, data27, data28, data29, data30; - FLOAT data31, data32, data33, data34, data35, data36; - - FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; - - //js = (n >> 2); - js = n/6; - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - ao5 = a + posX + (posY + 4) * lda; - ao6 = a + posX + (posY + 5) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + posY + (posX + 3) * lda; - ao5 = a + posY + (posX + 4) * lda; - ao6 = a + posY + (posX + 5) * lda; - } - - i = m/6; - if (i > 0) { - do { - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - data05 = *(ao1 + 4); - data06 = *(ao1 + 5); - - data07 = *(ao2 + 0); - data08 = *(ao2 + 1); - data09 = *(ao2 + 2); - data10 = *(ao2 + 3); - data11 = *(ao2 + 4); - data12 = *(ao2 + 5); - - data13 = *(ao3 + 0); - data14 = *(ao3 + 1); - data15 = *(ao3 + 2); - data16 = *(ao3 + 3); - data17 = *(ao3 + 4); - data18 = *(ao3 + 5); - - data19 = *(ao4 + 0); - data20 = *(ao4 + 1); - data21 = *(ao4 + 2); - data22 = *(ao4 + 3); - data23 = *(ao4 + 4); - data24 = *(ao4 + 5); - - data25 = *(ao5 + 0); - data26 = *(ao5 + 1); - data27 = *(ao5 + 2); - data28 = *(ao5 + 3); - data29 = *(ao5 + 4); - data30 = *(ao5 + 5); - - data31 = *(ao6 + 0); - data32 = *(ao6 + 1); - data33 = *(ao6 + 2); - data34 = *(ao6 + 3); - data35 = *(ao6 + 4); - data36 = *(ao6 + 5); - - b[ 0] = data01; - b[ 1] = data07; - b[ 2] = data13; - b[ 3] = data19; - b[ 4] = data25; - b[ 5] = data31; - - b[ 6] = data02; - b[ 7] = data08; - b[ 8] = data14; - b[ 9] = data20; - b[10] = data26; - b[11] = data32; - - b[12] = data03; - b[13] = data09; - b[14] = data15; - b[15] = data21; - b[16] = data27; - b[17] = 
data33; - - b[18] = data04; - b[19] = data10; - b[20] = data16; - b[21] = data22; - b[22] = data28; - b[23] = data34; - - b[24] = data05; - b[25] = data11; - b[26] = data17; - b[27] = data23; - b[28] = data29; - b[29] = data35; - - b[30] = data06; - b[31] = data12; - b[32] = data18; - b[33] = data24; - b[34] = data30; - b[35] = data36; - - ao1 += 6; - ao2 += 6; - ao3 += 6; - ao4 += 6; - ao5 += 6; - ao6 += 6; - b += 36; - } else - if (X > posY) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = ZERO; - b[ 5] = ZERO; - b[ 6] = ZERO; - b[ 7] = ZERO; - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ZERO; - b[11] = ZERO; - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ZERO; - b[16] = ZERO; - b[17] = ZERO; - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = ZERO; - b[22] = ZERO; - b[23] = ZERO; - b[24] = ZERO; - b[25] = ZERO; - b[26] = ZERO; - b[27] = ZERO; - b[28] = ZERO; - b[29] = ZERO; - b[30] = ZERO; - b[31] = ZERO; - b[32] = ZERO; - b[33] = ZERO; - b[34] = ZERO; - b[35] = ZERO; - - ao1 += 6 * lda; - ao2 += 6 * lda; - ao3 += 6 * lda; - ao4 += 6 * lda; - ao5 += 6 * lda; - ao6 += 6 * lda; - - b += 36; - } else { - data01 = *(ao1 + 0); - data07 = *(ao2 + 0); - data13 = *(ao3 + 0); - data19 = *(ao4 + 0); - data25 = *(ao5 + 0); - data31 = *(ao6 + 0); - - data08 = *(ao2 + 1); - data14 = *(ao3 + 1); - data20 = *(ao4 + 1); - data26 = *(ao5 + 1); - data32 = *(ao6 + 1); - - data15 = *(ao3 + 2); - data21 = *(ao4 + 2); - data27 = *(ao5 + 2); - data33 = *(ao6 + 2); - - data22 = *(ao4 + 3); - data28 = *(ao5 + 3); - data34 = *(ao6 + 3); - - data29 = *(ao5 + 4); - data35 = *(ao6 + 4); - - data36 = *(ao6 + 5); + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = n/6; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + 
ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } + + i = m/6; + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + } else if (X > posY) { + // b[ 0] = ZERO; + // b[ 1] = ZERO; + // b[ 2] = ZERO; + // b[ 3] = ZERO; + // b[ 4] = ZERO; + // b[ 5] = ZERO; + // b[ 6] = ZERO; + // b[ 7] = ZERO; + // b[ 8] = ZERO; + // b[ 9] = ZERO; + // b[10] = ZERO; + // b[11] = ZERO; + // b[12] = ZERO; + // b[13] = ZERO; + // b[14] = ZERO; + // b[15] = ZERO; + // b[16] = ZERO; + // b[17] = ZERO; + // b[18] = ZERO; + // b[19] = ZERO; + // b[20] = ZERO; + // b[21] = ZERO; + // b[22] = ZERO; + // b[23] = ZERO; + // b[24] = ZERO; + // b[25] = ZERO; + // b[26] = ZERO; + // b[27] = ZERO; + // b[28] = ZERO; + // b[29] = ZERO; + // b[30] = ZERO; + // b[31] = ZERO; + // b[32] = ZERO; + // b[33] = ZERO; + // b[34] = ZERO; + // b[35] = ZERO; + + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = *(ao3 + 1); + b[ 9] = *(ao4 + 1); + b[10] = *(ao5 + 1); + b[11] = *(ao6 + 1); + + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; +#else + b[ 14] = *(ao3 + 2); +#endif + b[ 15] = *(ao4 + 2); + b[ 16] = *(ao5 + 2); + b[ 17] = *(ao6 + 2); + + b[ 18] = ZERO; + b[ 19] = ZERO; 
+ b[ 20] = ZERO; +#ifdef UNIT + b[ 21] = ONE; +#else + b[ 21] = *(ao4 + 3); +#endif + b[ 22] = *(ao5 + 3); + b[ 23] = *(ao6 + 3); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; +#else + b[ 28] = *(ao5 + 4); +#endif + b[ 29] = *(ao6 + 4); + + b[ 30] = ZERO; + b[ 31] = ZERO; + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; +#ifdef UNIT + b[ 35] = ONE; +#else + b[ 35] = *(ao6 + 5); +#endif + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } + X += 6; + i --; + } while (i > 0); + } + i = m % 6; + if (i) { + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + } else if (X > posY) { + b += 6 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + b += 6; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = *(ao3 + 1); + b[ 3] = *(ao4 + 1); + b[ 4] = *(ao5 + 1); + b[ 5] = *(ao6 + 1); + b += 6; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2); +#endif + b[ 3] = *(ao4 + 2); + b[ 4] = *(ao5 + 2); + b[ 5] = *(ao6 + 2); + b += 6; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = *(ao5 + 3); + b[ 5] = *(ao6 + 3); + b += 6; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = *(ao6 + 4); + b += 6; + } + } + } + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + + if ((n % 6) & 4){ + X 
= posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 2; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = ZERO; #ifdef UNIT - b[ 0] = ONE; - b[ 1] = data07; - b[ 2] = data13; - b[ 3] = data19; - b[ 4] = data25; - b[ 5] = data31; - - b[ 6] = ZERO; - b[ 7] = ONE; - b[ 8] = data14; - b[ 9] = data20; - b[10] = data26; - b[11] = data32; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = ONE; - b[15] = data21; - b[16] = data27; - b[17] = data33; - - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = ONE; - b[22] = data28; - b[23] = data34; - - b[24] = ZERO; - b[25] = ZERO; - b[26] = ZERO; - b[27] = ZERO; - b[28] = ONE; - b[29] = data35; - - b[30] = ZERO; - b[31] = ZERO; - b[32] = ZERO; - b[33] = ZERO; - b[34] = ZERO; - b[35] = ONE; + b[ 5] = ONE; #else - b[ 0] = data01; - b[ 1] = data07; - b[ 2] = data13; - b[ 3] = data19; - b[ 4] = data25; - b[ 5] = data31; - - b[ 6] = ZERO; - b[ 7] = data08; - b[ 8] = data14; - b[ 9] = data20; - b[10] = data26; - b[11] = data32; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = data15; - b[15] = data21; - b[16] = data27; - b[17] = data33; - - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = data22; - b[22] = data28; - b[23] = data34; - - b[24] = ZERO; - b[25] = ZERO; - b[26] = ZERO; - b[27] = ZERO; - b[28] = data29; - 
b[29] = data35; - - b[30] = ZERO; - b[31] = ZERO; - b[32] = ZERO; - b[33] = ZERO; - b[34] = ZERO; - b[35] = data36; + b[ 5] = *(ao2 + 1); #endif + b[ 6] = *(ao3 + 1); + b[ 7] = *(ao4 + 1); - ao1 += 6; - ao2 += 6; - ao3 += 6; - ao4 += 6; - ao5 += 6; - ao6 += 7; - - b += 36; - } - X += 6; - i --; - } while (i > 0); - } - mm = m - m/6; - if (mm & 4) { - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b[ 4] = data02; - b[ 5] = data06; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = data11; - b[11] = data15; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = data16; - - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } else - if (X > posY) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = ZERO; - b[ 5] = ZERO; - b[ 6] = ZERO; - b[ 7] = ZERO; - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ZERO; - b[11] = ZERO; - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ZERO; - b[16] = ZERO; - b[17] = ZERO; - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = ZERO; - b[22] = ZERO; - b[23] = ZERO; - - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - - b += 16; - } else { + b[ 8] = ZERO; + b[ 9] = ZERO; #ifdef UNIT - data05 = *(ao2 + 0); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - - b[ 4] = ZERO; - b[ 5] = ONE; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ONE; - b[11] = data15; - 
- b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ONE; + b[ 10] = ONE; #else - data01 = *(ao1 + 0); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - - b[ 4] = ZERO; - b[ 5] = data06; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = data11; - b[11] = data15; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = data16; + b[ 10] = *(ao3 + 2); #endif - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - - b += 16; - } - X += 4; - } - - if (mm & 3) { - if (X < posY) { - if (mm & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao2 + 0); - data04 = *(ao2 + 1); - data05 = *(ao3 + 0); - data06 = *(ao3 + 1); - data07 = *(ao4 + 0); - data08 = *(ao4 + 1); - - b[ 0] = data01; - b[ 1] = data03; - b[ 2] = data05; - b[ 3] = data07; - b[ 4] = data02; - b[ 5] = data04; - b[ 6] = data06; - b[ 7] = data08; - - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (mm & 1) { - data01 = *(ao1 + 0); - data03 = *(ao2 + 0); - data05 = *(ao3 + 0); - data07 = *(ao4 + 0); - - b[ 0] = data01; - b[ 1] = data03; - b[ 2] = data05; - b[ 3] = data07; - - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X > posY) { - if (m & 2) { - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 8; - } - - if (m & 1) { - ao1 += lda; - b += 4; - } - - } else { + b[ 11] = *(ao4 + 2); + + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + X += 4; + i -= 2; + continue; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] 
= *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X > posY) { + /* ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; */ + b += 4 * i; + } else { #ifdef UNIT - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data15 = *(ao4 + 2); - } - - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = ONE; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ONE; - b[ 3] = data15; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data06 = *(ao2 + 1); - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data11 = *(ao3 + 2); - data15 = *(ao4 + 2); - } - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = data06; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = data11; - b[ 3] = data15; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - if (n & 2){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data02; - b[ 3] = data06; - - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X > posY) { - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - - } else { + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b += 4; + } + 
} + + posY += 4; + } + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { #ifdef UNIT - data05 = *(ao2 + 0); + data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = ZERO; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = ZERO; - b[ 3] = data06; + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; #endif - ao1 += 2 * lda; - ao2 += 2 * lda; - - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X < posY) { - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - - b[ 0] = data01; - b[ 1] = data05; - ao1 += 1; - ao2 += 1; - b += 2; - } else - if (X > posY) { - ao1 += lda; - ao2 += lda; - b += 2; - } else { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { #ifdef UNIT - data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = data05; + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); - b[ 0] = data01; - b[ 1] = data05; 
+ b[ 0] = data01; + b[ 1] = data05; #endif - ao1 += lda; - ao2 += lda; - b += 2; - } - } - - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - } - - i = m; - if (m > 0) { - do { - if (X < posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - ao1 += 1; - b += 1; - } else - if (X > posY) { - ao1 += lda; - b += 1; - } else { + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else if (X > posY) { + ao1 += lda; + b += 1; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - b += 1; - } + ao1 += lda; + b += 1; + } - X += 1; - i --; - } while (i > 0); - } - } + X += 1; + i --; + } while (i > 0); + } + } - return 0; + return 0; } diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c index 441f7338..e7ec4999 100644 --- a/kernel/generic/trmm_utcopy_6.c +++ b/kernel/generic/trmm_utcopy_6.c @@ -41,432 +41,510 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X; - - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *ao1, *ao2, *ao3, *ao4; - - js = (n >> 2); - - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + posY 
+ (posX + 3) * lda; - } - - i = (m >> 2); - if (i > 0) { - do { - if (X < posY) { - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } else - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = data11; - b[11] = data12; - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = data16; - - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - b += 16; - - } else { + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } + + i = (m / 6); + if (i > 0) { + do { + if (X < posY) { + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + + b += 36; + } else if (X > posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + b += 6; + } + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 
* lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + } else { #ifdef UNIT - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - - b[ 4] = data05; - b[ 5] = ONE; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = ONE; - b[11] = ZERO; - - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = data11; - b[11] = ZERO; - - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = data16; + b[ 0] = *(ao1 + 0); #endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - - b += 16; - } - - X += 4; - i --; - } while (i > 0); - } - - i = (m & 3); - if (i) { - - if (X < posY) { - - if (m & 2) { - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (m & 1) { - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X > posY) { - if (m & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 8; - } - - if (m & 1) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - 
b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - - ao1 += lda; - b += 4; - } - - } else { + b[ 6] = *(ao2 + 0); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + + b[12] = *(ao3 + 0); + b[13] = *(ao3 + 1); +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(ao3 + 2); +#endif + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + + b[18] = *(ao4 + 0); + b[19] = *(ao4 + 1); + b[20] = *(ao4 + 2); +#ifdef UNIT + b[21] = ONE; +#else + b[21] = *(ao4 + 3); +#endif + b[22] = ZERO; + b[23] = ZERO; + + b[24] = *(ao5 + 0); + b[25] = *(ao5 + 1); + b[26] = *(ao5 + 2); + b[27] = *(ao5 + 3); +#ifdef UNIT + b[28] = ONE; +#else + b[28] = *(ao5 + 4); +#endif + b[29] = ZERO; + + b[30] = *(ao6 + 0); + b[31] = *(ao6 + 1); + b[32] = *(ao6 + 2); + b[33] = *(ao6 + 3); + b[34] = *(ao6 + 4); +#ifdef UNIT + b[35] = ONE; +#else + b[35] = *(ao6 + 5); +#endif + + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } + + X += 6; + i --; + } while (i > 0); + } + + i = m % 6; + if (i > 0) { + if (X < posY) { + + ao1 += i; + ao2 += i; + ao3 += i; + ao4 += i; + ao5 += i; + ao6 += i; + b += 6 * i; + + } else if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + ao2 += lda; + ao3 += lda; + ao4 += lda; + ao5 += lda; + ao6 += lda; + b += 6; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + + if (i >= 2) { + b[ 0] = *(ao2 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 3) { + b[ 0] = *(ao3 + 0); + b[ 1] = *(ao3 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2); 
+#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 4) { + b[ 0] = *(ao4 + 0); + b[ 1] = *(ao4 + 1); + b[ 2] = *(ao4 + 2); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 5) { + b[ 0] = *(ao5 + 0); + b[ 1] = *(ao5 + 1); + b[ 2] = *(ao5 + 2); + b[ 3] = *(ao5 + 3); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = ZERO; + b += 6; + } + } + } + + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + if ((n % 6) & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } else if (X > posY) { + for (ii = 0; ii < 2; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + ao1 += lda; + b += 4; + } + + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(ao2 + 0); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(ao2 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(ao3 + 0); + b[ 9] = *(ao3 + 1); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(ao3 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(ao4 + 0); + b[ 13] = *(ao4 + 1); + b[ 14] = *(ao4 + 2); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + X += 4; + i -= 2; + continue; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X < posY) { + ao1 += i; 
+ ao2 += i; + ao3 += i; + ao4 += i; + b += 4 * i; + } else if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + ao1 += lda; + b += 4; + } + ao2 += lda; + ao3 += lda; + ao4 += lda; + } else { #ifdef UNIT - if (i >= 2) { - data05 = *(ao2 + 0); - } - - if (i >= 3) { - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - } - - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - - if(i >= 2) { - b[ 0] = data05; - b[ 1] = ONE; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - } - - if (i >= 3) { - b[ 0] = data09; - b[ 1] = data10; - b[ 2] = ONE; - b[ 3] = ZERO; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - - if (i >= 2) { - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - } - - if (i >= 3) { - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - } - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - - if(i >= 2) { - b[ 0] = data05; - b[ 1] = data06; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - } - - if (i >= 3) { - b[ 0] = data09; - b[ 1] = data10; - b[ 2] = data11; - b[ 3] = ZERO; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - if (n & 2){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X < posY) { - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data05; - b[ 3] = data06; - - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } else { + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 4; + } + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + 
ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { #ifdef UNIT - data05 = *(ao2 + 0); + data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = data05; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = data05; - b[ 3] = data06; + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; #endif - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X < posY) { - ao1 += 2; - b += 2; - } else - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - - b[ 0] = data01; - b[ 1] = data02; - - ao1 += lda; - b += 2; - } else { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { #ifdef UNIT - b[ 0] = ONE; - b[ 1] = ZERO; + b[ 0] = ONE; + b[ 1] = ZERO; #else - data01 = *(ao1 + 0); + data01 = *(ao1 + 0); - b[ 0] = data01; - b[ 1] = ZERO; + b[ 0] = data01; + b[ 1] = ZERO; #endif - b += 2; - } - } - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - } - - i = m; - if (m > 0) { - do { - - if (X < posY) { - b 
+= 1; - ao1 += 1; - } else - if (X > posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - ao1 += lda; - b += 1; - } else { + b += 2; + } + } + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b += 1; + ao1 += 1; + } else if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - b += 1; - } + ao1 += lda; + b += 1; + } - X += 1; - i --; - } while (i > 0); - } - } + X += 1; + i --; + } while (i > 0); + } + } - return 0; + return 0; } diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c index a37c50d1..b0cc7ba4 100644 --- a/kernel/generic/trsm_lncopy_6.c +++ b/kernel/generic/trsm_lncopy_6.c @@ -49,22 +49,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; - i = (m >> 2); ii = 0; + // i = (m >> 3); + i = (m / 6); while (i > 0) { if (ii == 
jj) { @@ -74,233 +87,562 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); #ifndef UNIT - data06 = *(a2 + 1); + data19 = *(a3 + 2); #endif - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); #ifndef UNIT - data11 = *(a3 + 2); + data28 = *(a4 + 3); #endif - data12 = *(a3 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); #ifndef UNIT - data16 = *(a4 + 3); + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + +#ifndef UNIT + data46 = *(a6 + 5); #endif *(b + 0) = INV(data01); - *(b + 4) = data02; - *(b + 5) = INV(data06); + *(b + 6) = data02; + *(b + 7) = INV(data10); - *(b + 8) = data03; - *(b + 9) = data07; - *(b + 10) = INV(data11); + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = INV(data19); + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = INV(data28); + + *(b + 24) = data05; + *(b + 25) = data13; + *(b + 26) = data21; + *(b + 27) = data29; + *(b + 28) = INV(data37); + + *(b + 30) = data06; + *(b + 31) = data14; + *(b + 32) = data22; + *(b + 33) = data30; + *(b + 34) = data38; + *(b + 35) = INV(data46); - *(b + 12) = data04; - *(b + 13) = data08; - *(b + 14) = data12; - *(b + 15) = INV(data16); } if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + 
data29 = *(a4 + 4); + data30 = *(a4 + 5); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; + + *(b + 24) = data05; + *(b + 25) = data13; + *(b + 26) = data21; + *(b + 27) = data29; + *(b + 28) = data37; + *(b + 29) = data45; + + *(b + 30) = data06; + *(b + 31) = data14; + *(b + 32) = data22; + *(b + 33) = data30; + *(b + 34) = data38; + *(b + 35) = data46; + } + + a1 += 6; + a2 += 6; + a3 += 6; + a4 += 6; + a5 += 6; + a6 += 6; + a7 += 6; + a8 += 6; + b += 36; + + i --; + ii += 6; + } + + if (mmod6 & 4) { + if (ii == jj) { +#ifndef UNIT data01 = *(a1 + 0); +#endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + *(b + 6) = data02; + *(b + 7) = INV(data10); + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = INV(data19); + + *(b + 
18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); *(b + 0) = data01; - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; - *(b + 4) = data02; - *(b + 5) = data06; - *(b + 6) = data10; - *(b + 7) = data14; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; - *(b + 8) = data03; - *(b + 9) = data07; - *(b + 10) = data11; - *(b + 11) = data15; - *(b + 12) = data04; - *(b + 13) = data08; - *(b + 14) = data12; - *(b + 15) = data16; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; - b += 16; - - i --; + a5 += 4; + a6 += 4; + a7 += 4; + a8 += 4; + b += 24; ii += 4; } - if ((m & 2) != 0) { - - if (ii== jj) { + if (mmod6 & 2) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - *(b + 4) = data02; - *(b + 5) = INV(data06); + *(b + 6) = data02; + *(b + 7) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = 
*(a2 + 0); - data04 = *(a2 + 1); - data05 = *(a3 + 0); - data06 = *(a3 + 1); - data07 = *(a4 + 0); - data08 = *(a4 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); *(b + 0) = data01; - *(b + 1) = data03; - *(b + 2) = data05; - *(b + 3) = data07; - *(b + 4) = data02; - *(b + 5) = data04; - *(b + 6) = data06; - *(b + 7) = data08; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; - b += 8; - + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 12; ii += 2; } - if ((m & 1) != 0) { + if (mmod6 & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + } + b += 6; + } + + a += 6 * lda; + jj += 6; + j --; + } + + if (nmod6 & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data10); + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = INV(data19); + + *(b + 12) = data04; + *(b + 13) = 
data12; + *(b + 14) = data20; + *(b + 15) = INV(data28); + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i -= 2; + ii += 4; + } + + else if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + i -- ; + ii += 2; + } + + else { + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + i -- ; + ii += 2; + } + } - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); - data03 = *(a3 + 0); - data04 = *(a4 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); *(b + 0) = data01; - *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; } - b += 4; + b += 4; } a += 4 * lda; jj += 4; - j --; } - if (n & 2) { + if (nmod6 & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; - i = (m >> 1); ii = 0; + i = (m >> 1); while (i > 0) { if (ii == jj) { - #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data02; - *(b + 3) = INV(data04); + *(b + 3) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; - *(b + 1) = data03; + *(b + 1) = data09; *(b + 2) = data02; - *(b + 3) = data04; + *(b + 3) = data10; } a1 += 2; a2 += 2; - b += 4; + b += 4; i --; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = 
INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); + data09 = *(a2 + 0); + *(b + 0) = data01; - *(b + 1) = data02; + *(b + 1) = data09; } - b += 2; + b += 2; } + a += 2 * lda; jj += 2; } - if (n & 1) { + if (nmod6 & 1) { a1 = a + 0 * lda; - i = m; ii = 0; + i = m; while (i > 0) { if (ii == jj) { @@ -315,8 +657,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1+= 1; - b += 1; + a1 += 1; + b += 1; + i --; ii += 1; } diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c index 12043eb3..9cda3d72 100644 --- a/kernel/generic/trsm_ltcopy_6.c +++ b/kernel/generic/trsm_ltcopy_6.c @@ -49,22 +49,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6, k; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; - i = (m >> 2); ii = 0; + // i = (m >> 3); + i = (m / 6); while (i > 0) { if (ii == jj) { @@ -75,35 +88,65 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 
1); #endif - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); #ifndef UNIT - data11 = *(a3 + 2); + data19 = *(a3 + 2); #endif - data12 = *(a3 + 3); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); #ifndef UNIT - data16 = *(a4 + 3); + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + +#ifndef UNIT + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + +#ifndef UNIT + data46 = *(a6 + 5); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; - *(b + 5) = INV(data06); - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 7) = INV(data10); + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; - *(b + 10) = INV(data11); - *(b + 11) = data12; + *(b + 14) = INV(data19); + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; - *(b + 15) = INV(data16); + *(b + 21) = INV(data28); + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 28) = INV(data37); + *(b + 29) = data38; + + *(b + 35) = INV(data46); } if (ii < jj) { @@ -111,21 +154,182 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 
= *(a6 + 4); + data46 = *(a6 + 5); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = data28; + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data33; + *(b + 25) = data34; + *(b + 26) = data35; + *(b + 27) = data36; + *(b + 28) = data37; + *(b + 29) = data38; + *(b + 30) = data41; + *(b + 31) = data42; + *(b + 32) = data43; + *(b + 33) = data44; + *(b + 34) = data45; + *(b + 35) = data46; + } - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + a1 += 6 * lda; + a2 += 6 * lda; + a3 += 6 * lda; + a4 += 6 * lda; + a5 += 6 * lda; + a6 += 6 * lda; + a7 += 6 * lda; + a8 += 6 * lda; + b += 36; - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); + i --; + ii += 6; + } - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + if (mmod6 & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + + *(b + 7) = INV(data10); + *(b + 8) = data11; + *(b + 9) = data12; + *(b 
+ 10) = data13; + *(b + 11) = data14; + + *(b + 14) = INV(data19); + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + + *(b + 21) = INV(data28); + *(b + 22) = data29; + *(b + 23) = data30; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -133,32 +337,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; - - *(b + 8) = data09; - *(b + 9) = data10; - *(b + 10) = data11; - *(b + 11) = data12; - *(b + 12) = data13; - *(b + 13) = data14; - *(b + 14) = data15; - *(b + 15) = data16; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = data28; + *(b + 22) = data29; + *(b + 23) = data30; } a1 += 4 * lda; a2 += 4 * lda; - a3 += 4 * lda; - a4 += 4 * lda; - b += 16; + /* a3 += 4 * lda; + a4 += 4 * lda; */ + b += 24; - i --; ii += 4; } - if ((m & 2) != 0) { - - if (ii== jj) { + if (mmod6 & 2) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); @@ -166,22 +376,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + 
data05 = *(a1 + 4); + data06 = *(a1 + 5); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; - *(b + 5) = INV(data06); - *(b + 6) = data07; - *(b + 7) = data08; - + *(b + 7) = INV(data10); + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; } if (ii < jj) { @@ -189,11 +406,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -201,20 +422,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; } a1 += 2 * lda; - a2 += 2 * lda; - b += 8; + // a2 += 2 * lda; + b += 12; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (mmod6 & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); @@ -222,38 +446,78 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + 
data05 = *(a1 + 4); + data06 = *(a1 + 5); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; } - b += 4; + b += 6; } + a += 6; + jj += 6; + j --; + } + if (nmod6 & 4) { + + a1 = a; a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + jj += 4; - j --; } - if (n & 2) { + if (nmod6 & 2) { + a1 = a + 0 * lda; a2 = a + 1 * lda; - i = (m >> 1); ii = 0; + i = (m >> 1); while (i > 0) { if (ii == jj) { @@ -264,25 +528,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data02; - - *(b + 3) = INV(data04); + *(b + 3) = INV(data10); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 2) = data09; + *(b + 3) = data10; } a1 += 2 * lda; @@ -293,19 +556,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif + // data02 = *(a1 + 1); + *(b + 0) = INV(data01); + // *(b + 1) = data02; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); + *(b + 0) = data01; *(b + 1) = data02; } @@ -315,11 +581,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj += 2; } - if (n & 1) { + if (nmod6 & 1) { + a1 = a + 0 * lda; - i = m; ii = 0; + i = m; while (i > 0) { if (ii == jj) { @@ 
-334,12 +601,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1 += 1 * lda; + a1 += lda; b += 1; i --; ii += 1; } + } return 0; diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c index a1bb1e20..e20773da 100644 --- a/kernel/generic/trsm_uncopy_6.c +++ b/kernel/generic/trsm_uncopy_6.c @@ -36,7 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include #include "common.h" #ifndef UNIT @@ -49,22 +48,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + // a7 = a + 6 * lda; + // a8 = a + 7 * lda; - i = (m >> 2); ii = 0; + + // i = (m >> 3); + i = (m / 6); while (i > 0) { if (ii == jj) { @@ -73,188 +88,729 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); #endif - data05 = *(a2 + 0); + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data09 = *(a3 + 0); - data10 = *(a3 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); #ifndef UNIT - data11 = 
*(a3 + 2); + data19 = *(a3 + 2); #endif - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); #ifndef UNIT - data16 = *(a4 + 3); + data28 = *(a4 + 3); #endif - *(b + 0) = INV(data01); - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif - *(b + 5) = INV(data06); - *(b + 6) = data10; - *(b + 7) = data14; + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif - *(b + 10) = INV(data11); - *(b + 11) = data15; +// data49 = *(a7 + 0); +// data50 = *(a7 + 1); +// data51 = *(a7 + 2); +// data52 = *(a7 + 3); +// data53 = *(a7 + 4); +// data54 = *(a7 + 5); +// #ifndef UNIT +// data55 = *(a7 + 6); +// #endif +// +// data57 = *(a8 + 0); +// data58 = *(a8 + 1); +// data59 = *(a8 + 2); +// data60 = *(a8 + 3); +// data61 = *(a8 + 4); +// data62 = *(a8 + 5); +// data63 = *(a8 + 6); +// #ifndef UNIT +// data64 = *(a8 + 7); +// #endif - *(b + 15) = INV(data16); + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 7) = INV(data10); + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + + *(b + 14) = INV(data19); + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 21) = INV(data28); + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; + + *(b + 28) = INV(data37); + *(b + 29) = data45; + // *(b + 38) = data53; + // *(b + 39) = data61; + + *(b + 35) = INV(data46); + // *(b + 46) = data54; + // *(b + 47) = data62; + + // *(b + 54) = INV(data55); + 
// *(b + 55) = data63; + + // *(b + 63) = INV(data64); } if (ii < jj) { - data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + // data07 = *(a1 + 6); + // data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + // data15 = *(a2 + 6); + // data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + // data23 = *(a3 + 6); + // data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + // data31 = *(a4 + 6); + // data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + // data39 = *(a5 + 6); + // data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + // data47 = *(a6 + 6); + // data48 = *(a6 + 7); + + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data51 = *(a7 + 2); + // data52 = *(a7 + 3); + // data53 = *(a7 + 4); + // data54 = *(a7 + 5); + // data55 = *(a7 + 6); + // data56 = *(a7 + 7); + + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + // data59 = *(a8 + 2); + // data60 = *(a8 + 3); + // data61 = *(a8 + 4); + // data62 = *(a8 + 5); + // data63 = *(a8 + 6); + // data64 = *(a8 + 7); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 
15) = data58; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; + + *(b + 24) = data05; + *(b + 25) = data13; + *(b + 26) = data21; + *(b + 27) = data29; + *(b + 28) = data37; + *(b + 29) = data45; + // *(b + 38) = data53; + // *(b + 39) = data61; + + *(b + 30) = data06; + *(b + 31) = data14; + *(b + 32) = data22; + *(b + 33) = data30; + *(b + 34) = data38; + *(b + 35) = data46; + // *(b + 46) = data54; + // *(b + 47) = data62; + + // *(b + 48) = data07; + // *(b + 49) = data15; + // *(b + 50) = data23; + // *(b + 51) = data31; + // *(b + 52) = data39; + // *(b + 53) = data47; + // *(b + 54) = data55; + // *(b + 55) = data63; + + // *(b + 56) = data08; + // *(b + 57) = data16; + // *(b + 58) = data24; + // *(b + 59) = data32; + // *(b + 60) = data40; + // *(b + 61) = data48; + // *(b + 62) = data56; + // *(b + 63) = data64; + } + + a1 += 6; + a2 += 6; + a3 += 6; + a4 += 6; + a5 += 6; + a6 += 6; + // a7 += 6; + // a8 += 6; + b += 36; - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); + i --; + ii += 6; + } - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + if (mmod6 & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + 
+ // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data51 = *(a7 + 2); + // data52 = *(a7 + 3); + + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + // data59 = *(a8 + 2); + // data60 = *(a8 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 7) = INV(data10); + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + + *(b + 14) = INV(data19); + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 21) = INV(data28); + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data51 = *(a7 + 2); + // data52 = *(a7 + 3); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + // data59 = *(a8 + 2); + // data60 = *(a8 + 3); *(b + 0) = data01; - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; - *(b + 4) = data02; - *(b + 5) = data06; - *(b + 6) = data10; - *(b + 7) = data14; - - *(b + 8) = data03; - *(b + 9) = data07; - *(b + 10) = data11; - *(b + 11) = data15; - *(b + 12) = data04; - *(b + 13) = data08; - *(b + 14) = data12; - *(b + 15) = data16; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; 
+ *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; - b += 16; - - i --; + a5 += 4; + a6 += 4; + // a7 += 4; + // a8 += 4; + b += 24; ii += 4; } - if ((m & 2) != 0) { + if (mmod6 & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); - if (ii== jj) { + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 7) = INV(data10); + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b 
+ 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 12; + ii += 2; + } + + if (mmod6 & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 0); + // data49 = *(a7 + 0); + // data57 = *(a8 + 0); - data05 = *(a2 + 0); + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + } + + if (ii < jj) { + data01 = *(a1 + 0); + // data02 = *(a1 + 1); + data09 = *(a2 + 0); + // data10 = *(a2 + 1); + data17 = *(a3 + 0); + // data18 = *(a3 + 1); + data25 = *(a4 + 0); + // data26 = *(a4 + 1); + + // // data33 = *(a5 + 0); + // data34 = *(a5 + 1); + // // data41 = *(a6 + 0); + // data42 = *(a6 + 1); + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + } + b += 6; + // ii += 1; + } + + a += 6 * lda; + jj += 6; + j --; + } + + + if (nmod6 & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data09 = *(a3 + 0); - data10 = *(a3 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif - data13 = *(a4 + 0); - 
data14 = *(a4 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif *(b + 0) = INV(data01); - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 5) = INV(data10); + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 10) = INV(data19); + *(b + 11) = data27; - *(b + 5) = INV(data06); - *(b + 6) = data10; - *(b + 7) = data14; + *(b + 15) = INV(data28); + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i -= 2; + ii += 4; } - if (ii < jj) { + else if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); - data05 = *(a3 + 0); - data06 = *(a3 + 1); - data07 = *(a4 + 0); - data08 = *(a4 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); *(b + 0) = data01; - *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; - *(b + 4) = data05; - *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + i -- ; + ii += 2; } + else{ + a1 += 2; a2 += 2; - b += 8; + a3 += 2; + a4 += 2; + b += 8; + i -- ; ii += 2; } + } - if ((m & 1) != 0) { + if (m & 1) { + if (ii == jj) { - if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif - - data05 = *(a2 + 0); - data09 = *(a3 + 0); - data13 = *(a4 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); *(b + 0) = INV(data01); - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); - data03 = *(a3 + 0); - data04 = *(a4 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); *(b + 
0) = data01; - *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; } b += 4; + // ii += 1; } - a += 4 * lda; + a += 4 * lda; jj += 4; - j --; } - if (n & 2) { + if (nmod6 & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; - i = (m >> 1); ii = 0; + + i = (m >> 1); while (i > 0) { if (ii == jj) { @@ -263,68 +819,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); #endif - data03 = *(a2 + 0); + data09 = *(a2 + 0); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - *(b + 1) = data03; - *(b + 3) = INV(data04); + *(b + 1) = data09; + + *(b + 3) = INV(data10); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; - *(b + 1) = data03; + *(b + 1) = data09; *(b + 2) = data02; - *(b + 3) = data04; + *(b + 3) = data10; } a1 += 2; a2 += 2; - b += 4; + b += 4; i --; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { - + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif - - data03 = *(a2 + 0); + data09 = *(a2 + 0); *(b + 0) = INV(data01); - *(b + 1) = data03; + *(b + 1) = data09; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); + data09 = *(a2 + 0); + *(b + 0) = data01; - *(b + 1) = data02; + *(b + 1) = data09; } b += 2; + // ii += 1; } - a += 2 * lda; + + a += 2 * lda; jj += 2; } - if (n & 1) { + if (nmod6 & 1) { a1 = a + 0 * lda; - i = m; ii = 0; + + i = m; while (i > 0) { if (ii == jj) { @@ -339,10 +897,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1+= 1; - b += 1; + a1 += 1; + b += 1; i --; - ii += 1; + ii ++; } } diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c index f8361722..6afc0054 100644 --- a/kernel/generic/trsm_utcopy_6.c +++ b/kernel/generic/trsm_utcopy_6.c @@ 
-49,21 +49,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6, k; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; - i = (m >> 2); + // i = (m >> 3); + i = (m / 6); ii = 0; while (i > 0) { @@ -72,37 +85,67 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); #endif - data05 = *(a2 + 0); + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data09 = *(a3 + 0); - data10 = *(a3 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); #ifndef UNIT - data11 = *(a3 + 2); + data19 = *(a3 + 2); #endif - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); #ifndef UNIT - data16 = *(a4 + 3); + data28 = *(a4 + 3); #endif - *(b + 0) = INV(data01); + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif - *(b + 4) = data05; - *(b + 5) = INV(data06); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif - *(b + 8) = data09; - *(b + 9) = 
data10; - *(b + 10) = INV(data11); + *(b + 0) = INV(data01); - *(b + 12) = data13; - *(b + 13) = data14; - *(b + 14) = data15; - *(b + 15) = INV(data16); + *(b + 6) = data09; + *(b + 7) = INV(data10); + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = INV(data19); + + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = INV(data28); + + *(b + 24) = data33; + *(b + 25) = data34; + *(b + 26) = data35; + *(b + 27) = data36; + *(b + 28) = INV(data37); + + *(b + 30) = data41; + *(b + 31) = data42; + *(b + 32) = data43; + *(b + 33) = data44; + *(b + 34) = data45; + *(b + 35) = INV(data46); } if (ii > jj) { @@ -110,21 +153,166 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = data28; + 
*(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data33; + *(b + 25) = data34; + *(b + 26) = data35; + *(b + 27) = data36; + *(b + 28) = data37; + *(b + 29) = data38; + *(b + 30) = data41; + *(b + 31) = data42; + *(b + 32) = data43; + *(b + 33) = data44; + *(b + 34) = data45; + *(b + 35) = data46; - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + } - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); + a1 += 6 * lda; + a2 += 6 * lda; + a3 += 6 * lda; + a4 += 6 * lda; + a5 += 6 * lda; + a6 += 6 * lda; + a7 += 6 * lda; + a8 += 6 * lda; + b += 36; - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + i --; + ii += 6; + } + + if (mmod6 & 4) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 6) = data09; + *(b + 7) = INV(data10); + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = INV(data19); + + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -132,44 +320,49 @@ int CNAME(BLASLONG m, BLASLONG n, 
FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; - - *(b + 8) = data09; - *(b + 9) = data10; - *(b + 10) = data11; - *(b + 11) = data12; - *(b + 12) = data13; - *(b + 13) = data14; - *(b + 14) = data15; - *(b + 15) = data16; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = data28; + *(b + 22) = data29; + *(b + 23) = data30; } a1 += 4 * lda; a2 += 4 * lda; - a3 += 4 * lda; - a4 += 4 * lda; - b += 16; - - i --; + /* a3 += 4 * lda; + a4 += 4 * lda; */ + b += 24; ii += 4; } - if ((m & 2) != 0) { - - if (ii== jj) { + if (mmod6 & 2) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif - data05 = *(a2 + 0); + + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - - *(b + 4) = data05; - *(b + 5) = INV(data06); + *(b + 6) = data09; + *(b + 7) = INV(data10); } if (ii > jj) { @@ -177,11 +370,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -189,46 +386,84 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = 
data13; + *(b + 11) = data14; } a1 += 2 * lda; - a2 += 2 * lda; - b += 8; - + // a2 += 2 * lda; + b += 12; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (mmod6 & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; } - b += 4; + b += 6; } + a += 6; + jj += 6; + j --; + } + + if (nmod6 & 4) { + + a1 = a; a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + jj += 4; - j --; } - if (n & 2) { + + if (nmod6 & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; @@ -240,58 +475,58 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifndef UNIT data01 = *(a1 + 0); #endif - data03 = *(a2 + 0); + + data09 = *(a2 + 0); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - *(b + 2) = data03; - *(b + 3) = INV(data04); + *(b + 2) = data09; + *(b + 3) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 2) = data09; + *(b + 3) = data10; } a1 += 2 * lda; a2 += 2 * lda; b += 4; - i --; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); + *(b + 
0) = data01; *(b + 1) = data02; } - b += 2; + b += 2; } a += 2; jj += 2; } - if (n & 1) { + if (nmod6 & 1) { a1 = a + 0 * lda; i = m; @@ -310,9 +545,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1 += 1 * lda; + a1 += lda; b += 1; - i --; ii += 1; } diff --git a/kernel/generic/zgemm3mkernel_dump.c b/kernel/generic/zgemm3mkernel_dump.c index a59bb08c..c4a614f0 100644 --- a/kernel/generic/zgemm3mkernel_dump.c +++ b/kernel/generic/zgemm3mkernel_dump.c @@ -25,10 +25,16 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if 1 +#include "zgemmkernel_2x2.c" + + +#else #include "common.h" int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc) { return 0; } +#endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 20d0769f..eff1581d 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -85,11 +85,11 @@ ZSWAPKERNEL = cswap_lasx.S CSUMKERNEL = csum_lasx.S ZSUMKERNEL = csum_lasx.S -DGEMMKERNEL = dgemm_kernel_16x4.S +DGEMMKERNEL = dgemm_kernel_16x6.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S -DGEMMONCOPY = dgemm_ncopy_4.S -DGEMMOTCOPY = dgemm_tcopy_4.S +DGEMMONCOPY = gemm_ncopy_6.prefx.c +DGEMMOTCOPY = dgemm_tcopy_6.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) @@ -153,13 +153,23 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S -DTRSMKERNEL_LT = 
dtrsm_kernel_LT_16x4_lasx.S -DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S -DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S +DTRSMKERNEL_LN = trsm_kernel_LN_UNROLLN6.c +DTRSMKERNEL_LT = trsm_kernel_LT_UNROLLN6.c +DTRSMKERNEL_RN = trsm_kernel_RN_UNROLLN6.c +DTRSMKERNEL_RT = trsm_kernel_RT_UNROLLN6.c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DGEMM_SMALL_M_PERMIT = dgemm_small_matrix_permit.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_lasx.S +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_lasx.S +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_lasx.S +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_lasx.S +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_lasx.S +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_lasx.S +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_lasx.S +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_lasx.S endif diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S index cf827379..a3626191 100644 --- a/kernel/loongarch64/cgemv_n_4_lsx.S +++ b/kernel/loongarch64/cgemv_n_4_lsx.S @@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 31 + push_if_used 7, 7 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -318,6 +318,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 31 + pop_if_used 7, 7 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S index ba38a957..44e59d0a 100644 --- a/kernel/loongarch64/cgemv_n_8_lasx.S +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -336,7 +336,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 31 + push_if_used 7, 7 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -378,6 +378,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 31 + pop_if_used 7, 7 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S index ada34936..6acf8c63 100644 --- a/kernel/loongarch64/cgemv_t_4_lsx.S +++ b/kernel/loongarch64/cgemv_t_4_lsx.S @@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 30 + push_if_used 8, 6 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -285,6 +285,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1: /* if (incx != 1) */ CGEMV_T_LSX GAP_1, X4_GAP .L_END: - pop_if_used 17 + 8, 30 + pop_if_used 8, 6 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_8_lasx.S b/kernel/loongarch64/cgemv_t_8_lasx.S index 94e4bd2e..f8a0ad12 100644 --- a/kernel/loongarch64/cgemv_t_8_lasx.S +++ b/kernel/loongarch64/cgemv_t_8_lasx.S @@ -304,7 +304,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 30 + push_if_used 8, 6 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -337,6 +337,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L_GAP_1: /* if (incx != 1) */ CGEMV_T_LASX GAP_1, X8_GAP .L_END: - pop_if_used 17 + 8, 30 + pop_if_used 8, 6 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dgemm_kernel_16x6.S b/kernel/loongarch64/dgemm_kernel_16x6.S new file mode 100644 index 00000000..90da1073 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_16x6.S @@ -0,0 +1,6256 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define C4 $r25 +#define C5 $r26 +#define T0 $r27 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r28 +#define T2 $r29 +#define I48 $r30 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define D0 $xr7 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr14 +#define D8 $xr15 +#define D9 $xr16 +#define D10 $xr17 +#define D11 $xr18 +#define D12 $xr19 +#define D13 $xr20 +#define D14 $xr21 +#define D15 $xr22 +#define D16 $xr23 +#define D17 $xr24 +#define D18 $xr25 +#define D19 $xr26 +#define D20 $xr27 +#define D21 $xr28 +#define D22 $xr29 +#define D23 $xr30 +#define VALPHA $xr31 + +/* Prefetch interval */ +#define A_PRE 0x200 /* 0x200 / 0x80 = 4 */ +#define B_PRE 0x100 /* 0x100 / 0x30 = 4 */ + +.macro KERNEL_16x6 + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + xvldrepl.d U6, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvfmadd.d D4, U0, U5, D4 + 
xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x18 + xvldrepl.d U5, B0, 0x20 + xvldrepl.d U6, B0, 0x28 + + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + xvfmadd.d D16, U0, U5, D16 + xvfmadd.d D17, U1, U5, D17 + xvfmadd.d D18, U2, U5, D18 + xvfmadd.d D19, U3, U5, D19 + preld 0, A0, A_PRE + 0x40 + + xvfmadd.d D20, U0, U6, D20 + xvfmadd.d D21, U1, U6, D21 + xvfmadd.d D22, U2, U6, D22 + xvfmadd.d D23, U3, U6, D23 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x30 +.endm + + PROLOGUE + + addi.d $sp, $sp, -160 + /* Store $r23~$31 */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 56 + SDARG $r31, $sp, 64 + fst.d $f23, $sp, 72 + fst.d $f24, $sp, 80 + fst.d $f25, $sp, 96 + fst.d $f26, $sp, 104 + fst.d $f27, $sp, 112 + fst.d $f28, $sp, 120 + fst.d $f29, $sp, 128 + fst.d $f30, $sp, 136 + fst.d $f31, $sp, 144 + fst.d ALPHA, $sp, 152 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + addi.d I48, ZERO, 48 + /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ + xvld VALPHA, $sp, 152 + xvreplve0.d VALPHA, VALPHA + xor T0, T0, T0 + addi.d T0, T0, 6 + /* if (!(N / 6)) goto L_N5 */ + div.d J, N, T0 /* J = bn / 6 */ + mul.d T0, J, T0 + sub.d N, N, T0 + beq ZERO, J, .L_N5 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N / 6) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x6 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + add.d C4, C3, T0 + add.d C5, C4, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D23, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + xvldrepl.d U6, B0, 0x10 + + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 + + xvldrepl.d U4, B0, 0x18 + xvldrepl.d U5, B0, 0x20 + xvldrepl.d U6, B0, 0x28 + + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + preld 0, C4, 0x00 + /* line 5 */ 
+ xvfmul.d D16, U0, U5 + xvfmul.d D17, U1, U5 + preld 0, C4, 0x40 + xvfmul.d D18, U2, U5 + xvfmul.d D19, U3, U5 + + preld 0, C5, 0x00 + /* line 6 */ + xvfmul.d D20, U0, U6 + xvfmul.d D21, U1, U6 + preld 0, C5, 0x40 + xvfmul.d D22, U2, U6 + xvfmul.d D23, U3, U6 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + /* Calculate 8 sets of D0~D23 */ +.L_TL1: /* TL-- */ + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + + /* Maybe we need calculate the last + * 7 sets of D0~D23? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + xvfmadd.d D18, U2, U4, D18 + xvfmadd.d D19, U3, U4, D19 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + xvfmadd.d D22, U2, U4, D22 + xvfmadd.d D23, U3, U4, D23 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, 
VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D17, D17, VALPHA + xvfmul.d D18, D18, VALPHA + xvfmul.d D19, D19, VALPHA + xvfmul.d D20, D20, VALPHA + xvfmul.d D21, D21, VALPHA + xvfmul.d D22, D22, VALPHA + xvfmul.d D23, D23, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvld U1, C4, 0x20 + xvld U2, C4, 0x40 + xvld U3, C4, 0x60 + xvfmadd.d D16, D16, VALPHA, U0 + xvfmadd.d D17, D17, VALPHA, U1 + xvfmadd.d D18, D18, VALPHA, U2 + xvfmadd.d D19, D19, VALPHA, U3 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvld U1, C5, 0x20 + xvld U2, C5, 0x40 + xvld U3, C5, 0x60 + xvfmadd.d D20, D20, VALPHA, U0 + xvfmadd.d D21, D21, VALPHA, U1 + xvfmadd.d D22, D22, VALPHA, U2 + 
xvfmadd.d D23, D23, VALPHA, U3 + #endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + /* Store C4 */ + xvst D16, C4, 0x00 + xvst D17, C4, 0x20 + xvst D18, C4, 0x40 + xvst D19, C4, 0x60 + /* Store C5 */ + xvst D20, C5, 0x00 + xvst D21, C5, 0x20 + xvst D22, C5, 0x40 + xvst D23, C5, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + addi.d C4, C4, 0x80 + addi.d C5, C5, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, 
A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + xvfmul.d D17, U1, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + xvfmul.d D21, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 + +.L_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d 
A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + 
xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 + +.L_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d 
D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M8_L71 + +.L_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D17, D17, VALPHA + xvfmul.d D20, D20, VALPHA + xvfmul.d D21, D21, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvld U1, C4, 0x20 + xvfmadd.d D16, D16, VALPHA, U0 + xvfmadd.d D17, D17, VALPHA, U1 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvld U1, C5, 0x20 + xvfmadd.d D20, D20, VALPHA, U0 + xvfmadd.d D21, D21, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + /* Store C4 */ + xvst D16, C4, 0x00 + xvst D17, C4, 0x20 + /* Store C5 */ + xvst D20, C5, 0x00 + xvst D21, C5, 0x20 + + /* 
Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + addi.d C4, C4, 0x40 + addi.d C5, C5, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 8)) End************/ + +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + +.L_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, 
D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 
0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M4_TL1 + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d 
D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D20, D20, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvfmadd.d D16, D16, VALPHA, U0 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvfmadd.d D20, D20, VALPHA, U0 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + /* Store C4 */ + xvst D16, C4, 0x00 + /* Store C5 */ + xvst D20, C5, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + addi.d C4, C4, 0x20 + addi.d C5, C5, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + 
move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + +.L_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + 
/***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d 
U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D20, D20, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvfmadd.d D16, D16, VALPHA, U0 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvfmadd.d D20, D20, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D16, C4, 0x00, 0x00 + xvstelm.d D20, C5, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + xvstelm.d D16, C4, 0x08, 0x01 + xvstelm.d D20, C5, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + addi.d C4, C4, 0x10 + addi.d C5, C5, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + +.L_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 
0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, 
U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: +#ifdef TRMMKERNEL + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D20, D20, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) 
*/ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvfmadd.d D16, D16, VALPHA, U0 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvfmadd.d D20, D20, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D16, C4, 0x00, 0x00 + xvstelm.d D20, C5, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + addi.d C4, C4, 0x08 + addi.d C5, C5, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 6) + * C += (LDC * 6) + */ + /* since the array type is double, + * so we must mul 48 + */ + addi.d T2, ZERO,48 + mul.d T0, K, T2 + mul.d T1, LDC, T2 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x06 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + +.L_N5: + andi J, N, 4 + beq ZERO, J, .L_N3 + +/************************* Condition 2 if((N & 4) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x4 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N5_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N5_M8 + +.L_N5_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + xvfmul.d D10, U2, U4 + xvfmul.d D11, U3, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_L7 */ + beq ZERO,TL, .L_N5_L7 + +.L_N5_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 
0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d 
A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + 
xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_TL1 + +.L_N5_L7: + /* if (!(L & 7)) goto L_N5_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_L0 + +.L_N5_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + 
xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_L71 + +.L_N5_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + 
xvfmadd.d D15, D15, VALPHA, U3 + #endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N5_I1 + +.L_N5_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N5_M0 + + andi I, M, 8 + beq ZERO,I, .L_N5_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 
+ /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M8_L7 */ + beq ZERO,TL, .L_N5_M8_L7 + +.L_N5_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + 
xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M8_TL1 + +.L_N5_M8_L7: + 
/* if (!(L & 7)) goto L_N5_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M8_L0 + +.L_N5_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M8_L71 + +.L_N5_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d 
A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 8) ) End************/ + +.L_N5_M4: + andi I, M, 4 + beq ZERO,I, .L_N5_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M4_L7 */ + beq ZERO,TL, .L_N5_M4_L7 + +.L_N5_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-3***/ + 
xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M4_TL1 + +.L_N5_M4_L7: + /* if (!(L & 7)) goto L_N5_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M4_L0 + 
+.L_N5_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M4_L71 + +.L_N5_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 4) ) End************/ + +.L_N5_M2: + andi I, M, 2 + beq ZERO,I, .L_N5_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* 
number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M2_L7 */ + beq ZERO,TL, .L_N5_M2_L7 + +.L_N5_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + 
/***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M2_TL1 + +.L_N5_M2_L7: + /* if (!(L & 7)) goto L_N5_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M2_L0 + +.L_N5_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M2_L71 + +.L_N5_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + #else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * 
VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + #endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 2) ) End************/ + +.L_N5_M1: + andi I, M, 1 + beq ZERO,I, .L_N5_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 
+ + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M1_L7 */ + beq ZERO,TL, .L_N5_M1_L7 + +.L_N5_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + 
xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M1_TL1 + +.L_N5_M1_L7: + /* if (!(L & 7)) goto L_N5_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M1_L0 + +.L_N5_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M1_L71 + +.L_N5_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + #else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 1) ) End************/ + +.L_N5_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + addi.d T2, ZERO,32 + mul.d T0, K, T2 + mul.d T1, LDC, T2 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 4) && (M >> 4)) End !!! ************************* +* dgemm_core_16x4 */ + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 3 if((N & 2) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N3_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + +.L_N3_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + 
/***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, 
D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, 
D7, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N3_M0 + + andi I, M, 8 + beq ZERO,I, .L_N3_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d 
D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +.L_N3_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + 
/***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 + +.L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M8_L0 + +.L_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M8_L71 + +.L_N3_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if 
defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + +.L_N3_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + 
+ xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + 
+#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +.L_N3_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, 
U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + 
add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + +.L_N3_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-7***/ 
+ xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + addi.d T2, ZERO,16 + mul.d T0, K, T2 + mul.d T1, LDC, T2 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm 
>> 4 */ + +/************************* Condition 3 if((N & 2) && (M >> 4)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 4 if((N & 1) && (M >> 4)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N1_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +.L_N1_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 
0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + 
xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) 
|| (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 + +.L_N1_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + 
/***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 + +.L_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 + +.L_N1_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + 
xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +.L_N1_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + #endif + + /* Store C0 */ + 
xvst D0, C0, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +.L_N1_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 
0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif 
defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +.L_N1_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + 
addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 4 if((N & 1) && (M >> 4)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore $r23~$31 */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 56 + LDARG $r31, $sp, 64 + fld.d $f23, $sp, 72 + fld.d $f24, $sp, 80 + fld.d $f25, $sp, 96 + fld.d $f26, $sp, 104 + fld.d $f27, $sp, 112 + fld.d $f28, $sp, 120 + fld.d $f29, $sp, 128 + fld.d $f30, $sp, 136 + fld.d $f31, $sp, 144 + addi.d $sp, $sp, 160 + + /* Back home */ + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S index 95c87903..4c32e0ec 100644 --- a/kernel/loongarch64/dgemm_ncopy_16.S +++ b/kernel/loongarch64/dgemm_ncopy_16.S @@ -655,6 +655,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d TD, TD, 0x10 .L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 move S1, TS beq ZERO, M, .L_N0 diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S index 30bebe8d..4ca48550 100644 --- a/kernel/loongarch64/dgemm_ncopy_8_lsx.S +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define D7 $vr15 PROLOGUE - push_if_used 26, 32 + push_if_used 0, 0 move TD, DST move TS, SRC slli.d TL, LDA, 0x03 @@ -268,6 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d S2, S2, 0x08 addi.d TD, TD, 0x10 .L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 move S1, TS beq ZERO, M, .L_N0 .L_M1: @@ -278,6 +280,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d M, M, -1 blt ZERO, M, .L_M1 .L_N0: - pop_if_used 26, 32 + pop_if_used 0, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S new file mode 100644 index 00000000..a50350dd --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S @@ -0,0 +1,549 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M16 $t1 +#define M8 $t1 +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K8 $t3 +#define A0 $t4 +#define X0 $t5 +#define B1 $t6 +#define B2 $t7 +#define B3 $t8 +#define C0 $s0 +#define C1 $s1 +#define C2 $s2 +#define C3 $s3 +#define K1 $s4 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define D4 $xr6 +#define D5 $xr7 +#define D6 $xr8 +#define D7 $xr9 +#define D8 $xr10 +#define D9 $xr11 +#define D10 $xr12 +#define D11 $xr13 +#define D12 $xr14 +#define D13 $xr15 +#define D14 $xr16 +#define D15 $xr17 
+#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + +.macro DGEMM_SMALL_KERNEL_NN_TAIL M + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M\M\()_N3 +.L_M\M\()_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M\M\()_N4_END +.L_M\M\()_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1, D2, S0, Z2, D2, D3, S0, Z3, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N4_K1 +.L_M\M\()_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 + GLD xv, , S0, C2, 0x00 + GMADD xvf, d, D2, S0, VBETA, D2 + GLD xv, , S0, C3, 0x00 + GMADD xvf, d, D3, S0, VBETA, D3 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00 +.endif + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 
2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M\M\()_N4 +.L_M\M\()_N3: + andi N2, N, 0x02 + beqz N2, .L_M\M\()_N1 +.L_M\M\()_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N2_END +.L_M\M\()_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N2_K1 +.L_M\M\()_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00 +.endif + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M\M\()_N1: + andi N1, N, 0x01 + beqz N1, .L_M\M\()_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N1_END +.L_M\M\()_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N1_K1 +.L_M\M\()_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00 +.endif +.L_M\M\()_END: +.if \M == 4 + PTR_ADDI A, A, 0x20 + PTR_ADDI C, C, 0x20 +.elseif \M == 2 + PTR_ADDI A, A, 0x10 + PTR_ADDI C, C, 0x10 +.elseif \M == 1 +.endif +.endm + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 5, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, 
VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_SLLI K8, K, 3 + PTR_SRAI M16, M, 4 // M >> 4 + beqz M16, .L_M15 +.L_M16: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M16_N3 +.L_M16_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \ + D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \ + D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M16_N4_END +.L_M16_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \ + D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \ + D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N4_K1 + .L_M16_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, 
D7 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60 + GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60 + GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15 +#endif + GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \ + D8, C2, 0x00, D9, C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \ + D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M16_N4 +.L_M16_N3: + andi N2, N, 0x02 + beqz N2, .L_M16_N1 +.L_M16_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N2_END +.L_M16_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N2_K1 +.L_M16_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, 
S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 +#endif + GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M16_N1: + andi N1, N, 0x01 + beqz N1, .L_M16_END + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N1_END +.L_M16_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N1_K1 +.L_M16_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0 + move A0, A +.L_M16_END: + PTR_ADDI M16, M16, -1 + PTR_ADDI A, A, 0x80 + PTR_ADDI C, C, 0x80 + bnez M16, .L_M16 +.L_M15: + andi M8, M, 0x08 + beqz M8, .L_M7 +.L_M8: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M8_N3 +.L_M8_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + PTR_ADDI N4, 
N4, -1 + bge ZERO, K, .L_M8_N4_END +.L_M8_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3, \ + D4, S0, Z2, D4, D5, S1, Z2, D5, \ + D6, S0, Z3, D6, D7, S1, Z3, D7, + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N4_K1 +.L_M8_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20 + GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7 +#endif + GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M8_N4 +.L_M8_N3: + andi N2, N, 0x02 + beqz N2, .L_M8_N1 +.L_M8_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N2_END +.L_M8_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA 
+ bnez K1, .L_M8_N2_K1 +.L_M8_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M8_N1: + andi N1, N, 0x01 + beqz N1, .L_M8_END + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N1_END +.L_M8_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N1_K1 +.L_M8_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +.L_M8_END: + PTR_ADDI A, A, 0x40 + PTR_ADDI C, C, 0x40 +.L_M7: + andi M4, M, 0x04 + beqz M4, .L_M3 +.L_M4: + DGEMM_SMALL_KERNEL_NN_TAIL 4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + DGEMM_SMALL_KERNEL_NN_TAIL 2 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + DGEMM_SMALL_KERNEL_NN_TAIL 1 +.L_M0: + pop_if_used 5, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_kernel_nt_lasx.S b/kernel/loongarch64/dgemm_small_kernel_nt_lasx.S new file mode 100644 index 00000000..aee0586f --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_nt_lasx.S @@ -0,0 +1,500 @@ +/*************************************************************************** +Copyright (c) 2024 The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M16 $t1 +#define M8 $t1 +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K_LDB $t3 +#define A0 $t4 +#define X0 $t5 +#define C0 $t6 +#define C1 $t7 +#define C2 $t8 +#define C3 $s0 +#define K1 $s1 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define D4 $xr6 +#define D5 $xr7 +#define D6 $xr8 +#define D7 $xr9 +#define D8 $xr10 +#define D9 $xr11 +#define D10 $xr12 +#define D11 $xr13 +#define D12 $xr14 +#define D13 $xr15 +#define D14 $xr16 +#define D15 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + +.macro DGEMM_SMALL_KERNEL_NT_TAIL M + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M\M\()_N3 +.L_M\M\()_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M\M\()_N4_END +.L_M\M\()_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1, D2, S0, Z2, D2, D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N4_K1 +.L_M\M\()_N4_END: + GMUL xvf, d, D0, D0, 
VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 + GLD xv, , S0, C2, 0x00 + GMADD xvf, d, D2, S0, VBETA, D2 + GLD xv, , S0, C3, 0x00 + GMADD xvf, d, D3, S0, VBETA, D3 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00 +.endif + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0 + move A0, A + bnez N4, .L_M\M\()_N4 +.L_M\M\()_N3: + andi N2, N, 0x02 + beqz N2, .L_M\M\()_N1 +.L_M\M\()_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N2_END +.L_M\M\()_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N2_K1 +.L_M\M\()_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00 +.endif + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M\M\()_N1: + andi N1, N, 0x01 + beqz N1, .L_M\M\()_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N1_END +.L_M\M\()_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, 
D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N1_K1 +.L_M\M\()_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00 +.endif +.L_M\M\()_END: +.if \M == 4 + PTR_ADDI A, A, 0x20 + PTR_ADDI C, C, 0x20 +.elseif \M == 2 + PTR_ADDI A, A, 0x10 + PTR_ADDI C, C, 0x10 +.elseif \M == 1 +.endif +.endm + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 2, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_MUL K_LDB, K, LDB + PTR_SRAI M16, M, 4 // M >> 4 + beqz M16, .L_M15 +.L_M16: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M16_N3 +.L_M16_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \ + D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \ + D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M16_N4_END +.L_M16_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \ + D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \ + D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N4_K1 + .L_M16_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, 
VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60 + GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60 + GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15 +#endif + GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \ + D8, C2, 0x00, D9, C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \ + D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0 + move A0, A + bnez N4, .L_M16_N4 +.L_M16_N3: + andi N2, N, 0x02 + beqz N2, .L_M16_N1 +.L_M16_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N2_END +.L_M16_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N2_K1 +.L_M16_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, 
VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 +#endif + GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M16_N1: + andi N1, N, 0x01 + beqz N1, .L_M16_END + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N1_END +.L_M16_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N1_K1 +.L_M16_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0 + move A0, A +.L_M16_END: + PTR_ADDI M16, M16, -1 + PTR_ADDI A, A, 0x80 + PTR_ADDI C, C, 0x80 + bnez M16, .L_M16 +.L_M15: + andi M8, M, 0x08 + beqz M8, .L_M7 +.L_M8: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M8_N3 
+.L_M8_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M8_N4_END +.L_M8_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3, \ + D4, S0, Z2, D4, D5, S1, Z2, D5, \ + D6, S0, Z3, D6, D7, S1, Z3, D7, + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N4_K1 +.L_M8_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20 + GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7 +#endif + GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0 + move A0, A + bnez N4, .L_M8_N4 +.L_M8_N3: + andi N2, N, 0x02 + beqz N2, .L_M8_N1 +.L_M8_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N2_END +.L_M8_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N2_K1 +.L_M8_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, 
D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M8_N1: + andi N1, N, 0x01 + beqz N1, .L_M8_END + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N1_END +.L_M8_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N1_K1 +.L_M8_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +.L_M8_END: + PTR_ADDI A, A, 0x40 + PTR_ADDI C, C, 0x40 +.L_M7: + andi M4, M, 0x04 + beqz M4, .L_M3 +.L_M4: + DGEMM_SMALL_KERNEL_NT_TAIL 4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + DGEMM_SMALL_KERNEL_NT_TAIL 2 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + DGEMM_SMALL_KERNEL_NT_TAIL 1 +.L_M0: + pop_if_used 2, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_kernel_tn_lasx.S b/kernel/loongarch64/dgemm_small_kernel_tn_lasx.S new file mode 100644 index 00000000..b1e58860 --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_tn_lasx.S @@ -0,0 +1,639 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K8 $t3 +#define A0 $t4 +#define X0 $t5 +#define B1 $t6 +#define B2 $t7 +#define B3 $t8 +#define C0 $s0 +#define C1 $s1 +#define C2 $s2 +#define C3 $s3 +#define K1 $s4 +#define A1 $s5 +#define A2 $s6 +#define A3 $s7 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define T0 $xr6 +#define T1 $xr7 +#define T2 $xr8 +#define T3 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define Y2 $xr12 +#define Y3 $xr13 +#define G0 $xr14 +#define G1 $xr15 +#define G2 $xr16 +#define G3 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 8, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_SLLI K8, K, 3 + PTR_SRAI M4, M, 2 // M >> 2 + beqz M4, .L_M3 +.L_M4: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M4_N3 +.L_M4_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 
+ move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M4_N4_END + PTR_SRAI K1, K1, 3 + beq ZERO, K1, .L_M4_N4_K7 +.L_M4_N4_K8: + PTR_ADDI K1, K1, -1 + GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08 + GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10 + GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + GMADD xvf, d, D0, S1, T0, D0, \ + D1, S1, T1, D1, \ + D2, S1, T2, D2, \ + D3, S1, T3, D3 + GMADD xvf, d, D0, S2, Y0, D0, \ + D1, S2, Y1, D1, \ + D2, S2, Y2, D2, \ + D3, S2, Y3, D3 + GMADD xvf, d, D0, S3, G0, D0, \ + D1, S3, G1, D1, \ + D2, S3, G2, D2, \ + D3, S3, G3, D3 + + GLD xv, , T0, A0, 0x20, T1, A1, 0x20, T2, A2, 0x20, T3, A3, 0x20 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + + GLDREPL xv, d, Z0, X0, 0x20, Z1, B1, 0x20, Z2, B2, 0x20, Z3, B3, 0x20 + GLDREPL xv, d, T0, X0, 0x28, T1, B1, 0x28, T2, B2, 0x28, T3, B3, 0x28 + GLDREPL xv, d, Y0, X0, 0x30, Y1, B1, 0x30, Y2, B2, 0x30, Y3, B3, 0x30 + GLDREPL xv, d, G0, X0, 0x38, G1, B1, 0x38, G2, B2, 0x38, G3, B3, 0x38 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + GMADD xvf, d, D0, S1, T0, D0, \ + D1, S1, T1, D1, \ + D2, S1, T2, D2, \ + D3, S1, T3, D3 + GMADD xvf, d, D0, S2, Y0, D0, \ + D1, S2, Y1, D1, \ + D2, S2, Y2, D2, \ + D3, S2, Y3, D3 + GMADD xvf, d, D0, S3, G0, D0, \ + D1, S3, G1, D1, \ + D2, S3, G2, D2, \ + D3, S3, G3, D3 + + PTR_ADDI X0, X0, 0x40 + PTR_ADDI B1, B1, 0x40 + PTR_ADDI B2, B2, 0x40 + PTR_ADDI B3, B3, 0x40 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI A1, A1, 0x40 + PTR_ADDI A2, A2, 0x40 + PTR_ADDI A3, A3, 0x40 + bnez K1, .L_M4_N4_K8 + .L_M4_N4_K7: + andi K1, K, 4 + beqz K1, .L_M4_N4_3 + 
.L_M4_N4_K4: + GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08 + GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10 + GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + GMADD xvf, d, D0, S1, T0, D0, \ + D1, S1, T1, D1, \ + D2, S1, T2, D2, \ + D3, S1, T3, D3 + GMADD xvf, d, D0, S2, Y0, D0, \ + D1, S2, Y1, D1, \ + D2, S2, Y2, D2, \ + D3, S2, Y3, D3 + GMADD xvf, d, D0, S3, G0, D0, \ + D1, S3, G1, D1, \ + D2, S3, G2, D2, \ + D3, S3, G3, D3 + PTR_ADDI X0, X0, 0x20 + PTR_ADDI B1, B1, 0x20 + PTR_ADDI B2, B2, 0x20 + PTR_ADDI B3, B3, 0x20 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI A1, A1, 0x20 + PTR_ADDI A2, A2, 0x20 + PTR_ADDI A3, A3, 0x20 + .L_M4_N4_3: + andi K1, K, 3 + beqz K1, .L_M4_N4_END + .L_M4_N4_K1: + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADDI K1, K1, -1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N4_K1 + .L_M4_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, \ + D1, S1, VBETA, D1, \ + D2, S2, VBETA, D2, \ + D3, S3, VBETA, D3 +#endif + GST xv, , D3, C3, 0x00, \ + D2, C2, 0x00, \ + D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, 
LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0, A1, A2, A3 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + bnez N4, .L_M4_N4 +.L_M4_N3: + andi N2, N, 0x02 + beqz N2, .L_M4_N1 +.L_M4_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N2_END +.L_M4_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N2_K1 +.L_M4_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_N1: + andi N1, N, 0x01 + beqz N1, .L_M4_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N1_END +.L_M4_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N1_K1 +.L_M4_N1_END: + GMUL xvf, d, D0, D0, 
VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST xv, , D0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_END: + PTR_ADDI M4, M4, -1 + PTR_ALSL A, LDA, A, 2 // A += LDA << 2; + PTR_ADDI C, C, 0x20 + bnez M4, .L_M4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M2_N3 +.L_M2_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M2_N4_END +.L_M2_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N4_K1 + .L_M2_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST v, , V3, C3, 0x00, \ + V2, C2, 0x00, \ + V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL 
B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA + bnez N4, .L_M2_N4 +.L_M2_N3: + andi N2, N, 0x02 + beqz N2, .L_M2_N1 +.L_M2_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N2_END +.L_M2_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N2_K1 +.L_M2_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST v, , V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA +.L_M2_N1: + andi N1, N, 0x01 + beqz N1, .L_M2_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N1_END +.L_M2_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N1_K1 +.L_M2_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST v, , V0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA +.L_M2_END: + PTR_ALSL A, LDA, A, 1 // A += LDA << 1; + PTR_ADDI C, C, 0x10 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, 
B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M1_N3 +.L_M1_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M1_N4_END +.L_M1_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N4_K1 + .L_M1_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST f, d, F3, C3, 0x00, \ + F2, C2, 0x00, \ + F1, C1, 0x00, \ + F0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0, A1 + move A0, A + bnez N4, .L_M1_N4 +.L_M1_N3: + andi N2, N, 0x02 + beqz N2, .L_M1_N1 +.L_M1_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N2_END +.L_M1_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N2_K1 +.L_M1_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST f, d, F1, C1, 0x00, \ + F0, 
C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M1_N1: + andi N1, N, 0x01 + beqz N1, .L_M0 + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N1_END +.L_M1_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N1_K1 +.L_M1_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST f, d, F0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0 + move A0, A +.L_M0: + pop_if_used 8, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_kernel_tt_lasx.S b/kernel/loongarch64/dgemm_small_kernel_tt_lasx.S new file mode 100644 index 00000000..b3e33851 --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_tt_lasx.S @@ -0,0 +1,534 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K_LDB $t3 +#define A0 $t4 +#define X0 $t5 +#define A1 $t6 +#define A2 $t7 +#define A3 $t8 +#define C0 $s0 +#define C1 $s1 +#define C2 $s2 +#define C3 $s3 +#define K1 $s4 +#define B1 $s5 +#define B2 $s6 +#define B3 $s7 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define T0 $xr6 +#define T1 $xr7 +#define T2 $xr8 +#define T3 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define Y2 $xr12 +#define Y3 $xr13 +#define G0 $xr14 +#define G1 $xr15 +#define G2 $xr16 +#define G3 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 
+#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 8, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_MUL K_LDB, K, LDB + PTR_SRAI M4, M, 2 // M >> 2 + beqz M4, .L_M3 +.L_M4: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M4_N3 +.L_M4_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M4_N4_END + PTR_SRAI K1, K1, 2 + beq ZERO, K1, .L_M4_N4_K3 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB +.L_M4_N4_K4: + PTR_ADDI K1, K1, -1 + GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08 + GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10 + GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, T0, D1, \ + D2, S0, Y0, D2, \ + D3, S0, G0, D3 + GMADD xvf, d, D0, S1, Z1, D0, \ + D1, S1, T1, D1, \ + D2, S1, Y1, D2, \ + D3, S1, G1, D3 + GMADD xvf, d, D0, S2, Z2, D0, \ + D1, S2, T2, D1, \ + D2, S2, Y2, D2, \ + D3, S2, G2, D3 + GMADD xvf, d, D0, S3, Z3, D0, \ + D1, S3, T3, D1, \ + D2, S3, Y3, D2, \ + D3, S3, G3, D3 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + + PTR_ADDI A0, A0, 0x20 + PTR_ADDI A1, A1, 0x20 + PTR_ADDI A2, A2, 0x20 + PTR_ADDI A3, A3, 0x20 + bnez K1, .L_M4_N4_K4 +.L_M4_N4_K3: + andi K1, K, 3 + beqz K1, .L_M4_N4_END 
+.L_M4_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N4_K1 + .L_M4_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, \ + D1, S1, VBETA, D1, \ + D2, S2, VBETA, D2, \ + D3, S3, VBETA, D3 +#endif + GST xv, , D3, C3, 0x00, \ + D2, C2, 0x00, \ + D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0, A1, A2, A3 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + bnez N4, .L_M4_N4 +.L_M4_N3: + andi N2, N, 0x02 + beqz N2, .L_M4_N1 +.L_M4_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N2_END +.L_M4_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N2_K1 +.L_M4_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, 
X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_N1: + andi N1, N, 0x01 + beqz N1, .L_M4_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N1_END +.L_M4_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N1_K1 +.L_M4_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST xv, , D0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_END: + PTR_ADDI M4, M4, -1 + PTR_ALSL A, LDA, A, 2 // A += LDA << 2; + PTR_ADDI C, C, 0x20 + bnez M4, .L_M4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M2_N3 +.L_M2_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M2_N4_END +.L_M2_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N4_K1 + .L_M2_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 
0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST v, , V3, C3, 0x00, \ + V2, C2, 0x00, \ + V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA + bnez N4, .L_M2_N4 +.L_M2_N3: + andi N2, N, 0x02 + beqz N2, .L_M2_N1 +.L_M2_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N2_END +.L_M2_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N2_K1 +.L_M2_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST v, , V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA +.L_M2_N1: + andi N1, N, 0x01 + beqz N1, .L_M2_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N1_END +.L_M2_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N1_K1 +.L_M2_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST v, , V0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0, A1 + move A0, A + 
PTR_ADD A1, A0, LDA +.L_M2_END: + PTR_ALSL A, LDA, A, 1 // A += LDA << 1; + PTR_ADDI C, C, 0x10 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M1_N3 +.L_M1_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M1_N4_END +.L_M1_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N4_K1 + .L_M1_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST f, d, F3, C3, 0x00, \ + F2, C2, 0x00, \ + F1, C1, 0x00, \ + F0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0, A1 + move A0, A + bnez N4, .L_M1_N4 +.L_M1_N3: + andi N2, N, 0x02 + beqz N2, .L_M1_N1 +.L_M1_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N2_END +.L_M1_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N2_K1 +.L_M1_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST f, d, F1, C1, 0x00, \ + F0, C0, 0x00 + // Update C0, C1 + 
PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M1_N1: + andi N1, N, 0x01 + beqz N1, .L_M0 + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N1_END +.L_M1_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N1_K1 +.L_M1_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST f, d, F0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0 + move A0, A +.L_M0: + pop_if_used 8, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_matrix_permit.c b/kernel/loongarch64/dgemm_small_matrix_permit.c new file mode 100644 index 00000000..df262a6b --- /dev/null +++ b/kernel/loongarch64/dgemm_small_matrix_permit.c @@ -0,0 +1,44 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + + if (transa) { + if (MNK <= 24.0 * 24.0 * 24.0) + return 1; + } else { + if (MNK <= 64.0 * 64.0 * 64.0) + return 1; + } + + return 0; +} + diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S index 13406647..d9a442e5 100644 --- a/kernel/loongarch64/dgemm_tcopy_4_lsx.S +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U7 $vr7 PROLOGUE - push_if_used 18, 8 + push_if_used 1, 0 move S0, SRC move P0, DST @@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fst.d F0, P3, 0x00 .L_M0: - pop_if_used 18, 8 + pop_if_used 1, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_6.S b/kernel/loongarch64/dgemm_tcopy_6.S new file mode 100644 index 00000000..d3bb4a2a --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_6.S @@ -0,0 +1,555 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define T0 $r27 +#define T1 $r28 +#define T2 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define V0 $vr0 +#define V1 $vr1 +#define V2 $vr2 +#define V3 $vr3 +/* LASX vectors */ +#define U0 $xr4 +#define U1 $xr5 +#define U2 $xr6 +#define U3 $xr7 +#define U4 $xr8 +#define U5 $xr9 +#define U6 $xr10 +#define U7 $xr11 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + addi.d I, ZERO, 0x06 + div.d T0, N, I // 1 + mul.d T1, I, T0 // 6 + sub.d N, N, T1 // 1 + + srai.d T2, N, 0x02 + slli.d T2, T2, 0x02 + add.d T2, T1, T2 + + mul.d P2, M, T1 + mul.d P3, M, T2 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + + srai.d T2, N, 0x01 + srai.d J, M, 0x03 + slli.d T2, T2, 0x01 + add.d T2, T1, T2 + + add.d P2, DST, P2 + mul.d P4, M, T2 + add.d P3, DST, P3 + slli.d P4, P4, 0x03 + slli.d TL, LDA, 0x03 + add.d P4, DST, P4 + + slli.d T2, TL, 0x01 + slli.d T1, M, 0x03 + mul.d T1, T1, I + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T2 + add.d S4, S2, T2 + add.d S5, S3, T2 + add.d S6, S4, T2 + add.d S7, S5, T2 + add.d S8, S6, T2 + add.d S0, S7, T2 + + move P1, P0 + 
addi.d P0, P0, 0x180 + + move I, T0 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + xvld U1, S2, 0x00 + vld V1, S2, 0x20 + xvld U2, S3, 0x00 + vld V2, S3, 0x20 + xvld U3, S4, 0x00 + vld V3, S4, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + xvst U1, P1, 0x30 + vst V1, P1, 0x50 + + xvst U2, P1, 0x60 + vst V2, P1, 0x80 + + xvst U3, P1, 0x90 + vst V3, P1, 0xB0 + + xvld U0, S5, 0x00 + vld V0, S5, 0x20 + xvld U1, S6, 0x00 + vld V1, S6, 0x20 + xvld U2, S7, 0x00 + vld V2, S7, 0x20 + xvld U3, S8, 0x00 + vld V3, S8, 0x20 + + xvst U0, P1, 0xC0 + vst V0, P1, 0xE0 + + xvst U1, P1, 0xF0 + vst V1, P1, 0x110 + + xvst U2, P1, 0x120 + vst V2, P1, 0x140 + + xvst U3, P1, 0x150 + vst V3, P1, 0x170 + + addi.d S1, S1, 0x30 + addi.d S2, S2, 0x30 + addi.d S3, S3, 0x30 + addi.d S4, S4, 0x30 + addi.d S5, S5, 0x30 + addi.d S6, S6, 0x30 + addi.d S7, S7, 0x30 + addi.d S8, S8, 0x30 + addi.d I, I, -1 + + add.d P1, P1, T1 + blt ZERO, I, .L_I1 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P3, 0x00 + xvst U2, P3, 0x20 + xvst U4, P3, 0x40 + xvst U6, P3, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + 
addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T2 + add.d S4, S2, T2 + add.d S0, S3, T2 + + move P1, P0 + addi.d P0, P0, 0xC0 + + move I, T0 + beq ZERO, I, .L_4N7 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + xvld U1, S2, 0x00 + vld V1, S2, 0x20 + xvld U2, S3, 0x00 + vld V2, S3, 0x20 + xvld U3, S4, 0x00 + vld V3, S4, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + xvst U1, P1, 0x30 + vst V1, P1, 0x50 + + xvst U2, P1, 0x60 + vst V2, P1, 0x80 + + xvst U3, P1, 0x90 + vst V3, P1, 0xB0 + + addi.d S1, S1, 0x30 + addi.d S2, S2, 0x30 + addi.d S3, S3, 0x30 + addi.d S4, S4, 0x30 + + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + 
xvst U0, P3, 0x00 + xvst U2, P3, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T2 + + move P1, P0 + addi.d P0, P0, 0x60 + + move I, T0 + beq ZERO, I, .L_2N7 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + xvld U1, S2, 0x00 + vld V1, S2, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + xvst U1, P1, 0x30 + vst V1, P1, 0x50 + + addi.d S1, S1, 0x30 + addi.d S2, S2, 0x30 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x30 + + move I, T0 + beq ZERO, I, .L_1N7 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + addi.d S1, S1, 0x30 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P2, 
0x00 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S index a7e3ef69..b4106e6a 100644 --- a/kernel/loongarch64/dgemm_tcopy_8_lsx.S +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U7 $vr7 PROLOGUE - push_if_used 24, 8 + push_if_used 7, 0 move S0, SRC move P0, DST @@ -592,6 +592,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d S1, S1, 0x08 addi.d P4, P4, 0x08 .L_M0: - pop_if_used 24, 8 + pop_if_used 7, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S index a49bf9bb..9fe4bfdd 100644 --- a/kernel/loongarch64/dgemv_n_8_lasx.S +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -509,7 +509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 24 + 4 + push_if_used 7, 4 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -549,6 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 24 + 4 + pop_if_used 7, 4 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S index 71f942b0..2c29bebe 100644 --- a/kernel/loongarch64/dgemv_t_8_lasx.S +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -445,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 24 + 3 + push_if_used 8, 3 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -476,6 +476,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1: /* if (incx != 1) */ DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP .L_END: - pop_if_used 17 + 8, 24 + 3 + pop_if_used 8, 3 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S index 0715b631..11c896cb 100644 --- a/kernel/loongarch64/dot_lasx.S +++ b/kernel/loongarch64/dot_lasx.S @@ -165,7 +165,7 @@ PROLOGUE /* store dot in s1 $f8 */ #ifdef DSDOT xvfadd.d $xr8, $xr8, $xr9 - fsub.s s2, s2, s2, /* set s2 to 0.0 */ + fsub.s s2, s2, s2 /* set s2 to 0.0 */ xvpermi.q $xr0, $xr8, 0x1 vfadd.d $vr8, $vr8, $vr0 vpackod.d $vr0, $vr8, $vr8 diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S index f7f61355..8a74d82e 100644 --- a/kernel/loongarch64/dot_lsx.S +++ b/kernel/loongarch64/dot_lsx.S @@ -165,7 +165,7 @@ PROLOGUE /* store dot in s1 $f8 */ #ifdef DSDOT vfadd.d $vr8, $vr8, $vr9 - fsub.s s2, s2, s2, /* set s2 to 0.0 */ + fsub.s s2, s2, s2 /* set s2 to 0.0 */ vpackod.d $vr0, $vr8, $vr8 vfadd.d $vr8, $vr8, $vr0 #else diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S index 3315dacc..e71fa7d3 100644 --- a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S +++ 
b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S @@ -1029,7 +1029,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 /* if (!(N >> 2)) goto L_N3 */ PTR_SRAI J, N, 2 /* J = bn >> 2 */ @@ -1361,6 +1361,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO, I, .L_N1_I1 .L_N1_M0: .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S index 0e2caccc..7fc62857 100644 --- a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S +++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S @@ -128,31 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dtrsm_kernel_macro.S" -.macro ldrepl_macro start, end, stride +.macro ldrepl_macro stride:req, index:req, more:vararg // Load Ux (x = 0...15) -.if \start <= \end - GLDREPL xv, d, $xr\start, A0, \stride * 8 - ldrepl_macro %start + 1, \end, %stride + 1 + GLDREPL xv, d, $xr\index, A0, \index * 8 - \stride * 8 +.ifnb \more + ldrepl_macro \stride, \more .endif .endm -.macro nmsub_macro start0, end0, start1, reg +.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg // Gx -= reg * Ux -.if \start0 <= \end0 xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 - nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.ifnb \more + nmsub_macro \reg, \more .endif .endm -.macro B_st_macro start, end, stride, N +.macro B_st_macro N:req, stride:req, start:req, more:vararg // Store Gx(x = 16...31) -.if \start <= \end .if \N == 4 - xvst $xr\start, B0, \stride * 0x20 + xvst $xr\start, B0, \start * 0x20 - \stride * 0x20 .elseif \N == 2 - vst $vr\start, B0, \stride * 0x10 + vst $vr\start, B0, \start * 0x10 - \stride * 0x10 .elseif \N == 1 - fst.d $f\start, B0, \stride * 0x08 + fst.d $f\start, B0, \start * 0x08 - \stride * 0x08 .endif - B_st_macro %start 
+ 1, \end, %stride + 1, \N +.ifnb \more + B_st_macro \N, \stride, \more .endif .endm @@ -194,86 +194,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 255 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 15, 0 + ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 31, 1, G0 + nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \ + 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 1 - ldrepl_macro 1, 15, 0 + ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G1, G1, U1 - nmsub_macro 18, 31, 2, G1 + nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \ + 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 2 - ldrepl_macro 2, 15, 0 + ldrepl_macro 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G2, G2, U2 - nmsub_macro 19, 31, 3, G2 + nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, \ + 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 3 - ldrepl_macro 3, 15, 0 + ldrepl_macro 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G3, G3, U3 - nmsub_macro 20, 31, 4, G3 + nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, \ + 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 4 - ldrepl_macro 4, 15, 0 + ldrepl_macro 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G4, G4, U4 - nmsub_macro 21, 31, 5, G4 + nmsub_macro G4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, \ + 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 5 - ldrepl_macro 5, 15, 0 + ldrepl_macro 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G5, G5, U5 - nmsub_macro 22, 31, 6, G5 + nmsub_macro G5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, \ + 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 6 - ldrepl_macro 6, 15, 0 + 
ldrepl_macro 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G6, G6, U6 - nmsub_macro 23, 31, 7, G6 + nmsub_macro G6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, \ + 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 7 - ldrepl_macro 7, 15, 0 + ldrepl_macro 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G7, G7, U7 - nmsub_macro 24, 31, 8, G7 + nmsub_macro G7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 8 - ldrepl_macro 8, 15, 0 + ldrepl_macro 8, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G8, G8, U8 - nmsub_macro 25, 31, 9, G8 + nmsub_macro G8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 9 - ldrepl_macro 9, 15, 0 + ldrepl_macro 9, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G9, G9, U9 - nmsub_macro 26, 31, 10, G9 + nmsub_macro G9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 10 - ldrepl_macro 10, 15, 0 + ldrepl_macro 10, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G10, G10, U10 - nmsub_macro 27, 31, 11, G10 + nmsub_macro G10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 11 - ldrepl_macro 11, 15, 0 + ldrepl_macro 11, 11, 12, 13, 14, 15 GMUL xvf, d, G11, G11, U11 - nmsub_macro 28, 31, 12, G11 + nmsub_macro G11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 12 - ldrepl_macro 12, 15, 0 + ldrepl_macro 12, 12, 13, 14, 15 GMUL xvf, d, G12, G12, U12 - nmsub_macro 29, 31, 13, G12 + nmsub_macro G12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 13 - ldrepl_macro 13, 15, 0 + ldrepl_macro 13, 13, 14, 15 GMUL xvf, d, G13, G13, U13 - nmsub_macro 30, 31, 14, G13 + nmsub_macro G13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 14 - ldrepl_macro 14, 15, 0 + ldrepl_macro 14, 14, 15 GMUL xvf, d, G14, G14, U14 - nmsub_macro 31, 31, 15, G14 + nmsub_macro G14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 15 - ldrepl_macro 15, 15, 0 + ldrepl_macro 15, 15 GMUL xvf, d, G15, G15, U15 // Finally, We can store the result. 
// For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 31, 0, \N + B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 @@ -334,46 +341,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 63 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 7, 0 + ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 23, 1, G0 + nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 1 - ldrepl_macro 1, 7, 0 + ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7 GMUL xvf, d, G1, G1, U1 - nmsub_macro 18, 23, 2, G1 + nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 2 - ldrepl_macro 2, 7, 0 + ldrepl_macro 2, 2, 3, 4, 5, 6, 7 GMUL xvf, d, G2, G2, U2 - nmsub_macro 19, 23, 3, G2 + nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 3 - ldrepl_macro 3, 7, 0 + ldrepl_macro 3, 3, 4, 5, 6, 7 GMUL xvf, d, G3, G3, U3 - nmsub_macro 20, 23, 4, G3 + nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 4 - ldrepl_macro 4, 7, 0 + ldrepl_macro 4, 4, 5, 6, 7 GMUL xvf, d, G4, G4, U4 - nmsub_macro 21, 23, 5, G4 + nmsub_macro G4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 5 - ldrepl_macro 5, 7, 0 + ldrepl_macro 5, 5, 6, 7 GMUL xvf, d, G5, G5, U5 - nmsub_macro 22, 23, 6, G5 + nmsub_macro G5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 6 - ldrepl_macro 6, 7, 0 + ldrepl_macro 6, 6, 7 GMUL xvf, d, G6, G6, U6 - nmsub_macro 23, 23, 7, G6 + nmsub_macro G6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 7 - ldrepl_macro 7, 7, 0 + ldrepl_macro 7, 7 GMUL xvf, d, G7, G7, U7 // Finally, We can store the result. 
// For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 23, 0, \N + B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 .if \N == 4 @@ -437,26 +444,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 15 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 3, 0 + ldrepl_macro 0, 0, 1, 2, 3 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 19, 1, G0 + nmsub_macro G0, 17, 1, 18, 2, 19, 3 PTR_ADDI A0, A0, 5 * 8 // Load 1 - ldrepl_macro 1, 3, 0 + ldrepl_macro 1, 1, 2, 3 GMUL xvf, d, G1, G1, U1 - nmsub_macro 18, 19, 2, G1 + nmsub_macro G1, 18, 2, 19, 3 PTR_ADDI A0, A0, 5 * 8 // Load 2 - ldrepl_macro 2, 3, 0 + ldrepl_macro 2, 2, 3 GMUL xvf, d, G2, G2, U2 - nmsub_macro 19, 19, 3, G2 + nmsub_macro G2, 19, 3 PTR_ADDI A0, A0, 5 * 8 // Load 3 - ldrepl_macro 3, 3, 0 + ldrepl_macro 3, 3 GMUL xvf, d, G3, G3, U3 // Finally, We can store the result. // For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 19, 0, \N + B_st_macro \N, 16, 16, 17, 18, 19 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 .if \N == 4 GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 @@ -501,16 +508,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 3 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 1, 0 + ldrepl_macro 0, 0, 1 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 17, 1, G0 + nmsub_macro G0, 17, 1 PTR_ADDI A0, A0, 3 * 8 // Load 1 - ldrepl_macro 1, 1, 0 + ldrepl_macro 1, 1 GMUL xvf, d, G1, G1, U1 // Finally, We can store the result. // For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 17, 0, \N + B_st_macro \N, 16, 16, 17 GSBUTTERFLY xv, d, U0, U1, G1, G0 .if \N == 4 vst $vr0, C0, 0x00 @@ -717,7 +724,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 /* if (!(N >> 2)) goto L_N3 */ PTR_SRAI J, N, 2 /* J = bn >> 2 */ @@ -954,6 +961,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADD AA, AA, T0 // aa += 1 * k .L_N1_M0: .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S index 42133973..be378631 100644 --- a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S +++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S @@ -128,33 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dtrsm_kernel_macro.S" -.macro ldrepl_macro start, end, stride +.macro ldrepl_macro stride:req, index:req, more:vararg // Load Ux (x = 0...15) -.if \start <= \end - GLDREPL xv, d, $xr\start, B0, \stride * 8 - ldrepl_macro %start + 1, \end, %stride + 1 + GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8 +.ifnb \more + ldrepl_macro \stride, \more .endif .endm - -.macro nmsub_macro start0, end0, start1, reg -// Ux -= reg * Dx -.if \start0 <= \end0 +.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg +// Gx -= reg * Ux xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 - nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.ifnb \more + nmsub_macro \reg, \more .endif .endm - -.macro A_st_macro start, end, stride, N -// Store Ux(x = 0...15) -.if \start <= \end +.macro A_st_macro N:req, stride:req, start:req, more:vararg +// Store Gx(x = 16...31) .if \N == 4 - xvst $xr\start, A0, \stride * 0x20 + xvst $xr\start, A0, \start * 0x20 - \stride * 0x20 .elseif \N == 2 - vst $vr\start, A0, \stride * 0x10 + vst $vr\start, A0, \start * 0x10 - \stride * 0x10 .elseif \N == 1 - fst.d $f\start, A0, \stride * 0x08 + fst.d $f\start, A0, \start * 0x08 - \stride * 0x08 .endif - A_st_macro %start + 1, \end, %stride + 1, \N +.ifnb \more + A_st_macro \N, \stride, \more .endif .endm 
@@ -167,22 +165,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 - ldrepl_macro 20, 22, 5 - nmsub_macro 4, 7, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + + nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 - ldrepl_macro 25, 25, 15 - nmsub_macro 8, 11, 0, D2 - nmsub_macro 8, 11, 4, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 8, 0, 9, 1, 10, 2, 11, 3 + nmsub_macro D5, 8, 4, 9, 5, 10, 6, 11, 7 GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 - nmsub_macro 12, 15, 0, D3 - nmsub_macro 12, 15, 4, D6 - nmsub_macro 12, 15, 8, D8 + nmsub_macro D3, 12, 0, 13, 1, 14, 2, 15, 3 + nmsub_macro D6, 12, 4, 13, 5, 14, 6, 15, 7 + nmsub_macro D8, 12, 8, 13, 9, 14, 10, 15, 11 GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 // Store A - A_st_macro 0, 15, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ @@ -197,13 +196,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 - ldrepl_macro 18, 18, 3 - nmsub_macro 4, 7, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 @@ -218,22 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0, U1, D0, U1 - ldrepl_macro 20, 22, 5 - nmsub_macro 2, 3, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 2, 0, 3, 1 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U2, D4, U2, U3, D4, U3 - ldrepl_macro 25, 25, 15 - nmsub_macro 4, 5, 0, D2 - nmsub_macro 4, 5, 2, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 4, 0, 5, 1 + nmsub_macro D5, 4, 2, 5, 3 GMUL xvf, d, U4, D7, U4, U5, D7, U5 - nmsub_macro 6, 7, 0, D3 - nmsub_macro 6, 7, 2, D6 - nmsub_macro 6, 7, 4, D8 + nmsub_macro D3, 6, 0, 7, 1 + nmsub_macro D6, 6, 2, 7, 3 + nmsub_macro D8, 6, 4, 7, 5 GMUL xvf, d, U6, D9, U6, U7, D9, U7 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20, \ @@ -248,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0, U1, D0, U1 - ldrepl_macro 18, 18, 3 - nmsub_macro 2, 3, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 2, 0, 3, 1 GMUL xvf, d, U2, D2, U2, U3, D2, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20 @@ -269,22 +268,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 20, 22, 5 - nmsub_macro 1, 1, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 1, 0 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U1, D4, U1 - ldrepl_macro 25, 25, 15 - nmsub_macro 2, 2, 0, D2 - nmsub_macro 2, 2, 1, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 2, 0 + nmsub_macro D5, 2, 1 GMUL xvf, d, U2, D7, U2 - nmsub_macro 3, 3, 0, D3 - nmsub_macro 3, 3, 1, D6 - nmsub_macro 3, 3, 2, D8 + nmsub_macro D3, 3, 0 + nmsub_macro D6, 3, 1 + nmsub_macro D8, 3, 2 GMUL xvf, d, U3, D9, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 .endm @@ -296,13 +295,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 18, 18, 3 - nmsub_macro 1, 1, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 1, 0 GMUL xvf, d, U1, D2, U1 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00 .endm @@ -316,23 +315,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 20, 22, 5 - nmsub_macro 1, 1, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 1, 0 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U1, D4, U1 - ldrepl_macro 25, 25, 15 - nmsub_macro 2, 2, 0, D2 - nmsub_macro 2, 2, 1, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 2, 0 + nmsub_macro D5, 2, 1 GMUL xvf, d, U2, D7, U2 - nmsub_macro 3, 3, 0, D3 - nmsub_macro 3, 3, 1, D6 - nmsub_macro 3, 3, 2, D8 + nmsub_macro D3, 3, 0 + nmsub_macro D6, 3, 1 + nmsub_macro D8, 3, 2 GMUL xvf, d, U3, D9, U3 // Store A - A_st_macro 0, 3, 0, 2 + A_st_macro 2, 0, 0, 1, 2, 3 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, .endm @@ -344,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 18, 18, 3 - nmsub_macro 1, 1, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 1, 0 GMUL xvf, d, U1, D2, U1 // Store A - A_st_macro 0, 1, 0, 2 + A_st_macro 2, 0, 0, 1 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 .endm @@ -364,23 +363,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 20, 22, 5 - nmsub_macro 1, 1, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 1, 0 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U1, D4, U1 - ldrepl_macro 25, 25, 15 - nmsub_macro 2, 2, 0, D2 - nmsub_macro 2, 2, 1, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 2, 0 + nmsub_macro D5, 2, 1 GMUL xvf, d, U2, D7, U2 - nmsub_macro 3, 3, 0, D3 - nmsub_macro 3, 3, 1, D6 - nmsub_macro 3, 3, 2, D8 + nmsub_macro D3, 3, 0 + nmsub_macro D6, 3, 1 + nmsub_macro D8, 3, 2 GMUL xvf, d, U3, D9, U3 // Store A - A_st_macro 0, 3, 0, 1 + A_st_macro 1, 0, 0, 1, 2, 3 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, .endm @@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 18, 18, 3 - nmsub_macro 1, 1, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 1, 0 GMUL xvf, d, U1, D2, U1 // Store A - A_st_macro 0, 1, 0, 1 + A_st_macro 1, 0, 0, 1 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 .endm @@ -582,10 +581,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvld U2, C0, 0x40 xvld U3, C0, 0x60 .L_dsolve_16x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 .endm @@ -599,10 +598,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvld U0, C0, 0x00 xvld U1, C0, 0x20 .L_dsolve_8x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20 .endm @@ -615,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Load C0 */ xvld U0, C0, 0x00 .L_dsolve_4x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 4 + A_st_macro 4, 0, 0 // Strore C GST xv, , U0, C0, 0x00 .endm @@ -631,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Load C0 */ xvld U0, C0, 0x00 .L_dsolve_2x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 2 + A_st_macro 2, 0, 0 // Strore C GST v, , $vr0, C0, 0x00 .endm @@ -647,16 +646,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Load C fld.d $f0, C0, 0x00 .L_dsolve_1x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 1 + A_st_macro 1, 0, 0 // Strore C GST f, d, $f0, C0, 0x00 .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 PTR_SUB KK, ZERO, OFFSET /* if (!(N >> 2)) goto L_N3 */ @@ -877,6 +876,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADD AA, AA, T0 // aa += 1 * k .L_N1_M0: .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S index 5f86d75b..fb087752 100644 --- a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S +++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S @@ -111,33 +111,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dtrsm_kernel_macro.S" -.macro ldrepl_macro start, end, stride +.macro ldrepl_macro stride:req, index:req, more:vararg // Load Ux (x = 0...15) -.if \start <= \end - GLDREPL xv, d, $xr\start, B0, \stride * 8 - ldrepl_macro %start + 1, \end, %stride + 1 + GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8 +.ifnb \more + ldrepl_macro \stride, \more .endif .endm - -.macro nmsub_macro start0, end0, start1, reg -// Ux -= reg * Dx -.if \start0 <= \end0 +.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg +// Gx -= reg * Ux xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 - nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.ifnb \more + nmsub_macro \reg, \more .endif .endm - -.macro A_st_macro start, end, stride, N -// Store Ux(x = 0...15) -.if \start <= \end +.macro A_st_macro N:req, stride:req, start:req, more:vararg +// Store Gx(x = 16...31) .if \N == 4 - xvst $xr\start, A0, \stride * 0x20 + xvst $xr\start, A0, \start * 0x20 - \stride * 0x20 .elseif \N == 2 - vst $vr\start, A0, \stride * 0x10 + vst $vr\start, A0, \start * 0x10 - \stride * 0x10 .elseif \N == 1 - fst.d $f\start, A0, \stride * 0x08 + fst.d $f\start, A0, \start * 0x08 - \stride * 0x08 .endif - A_st_macro %start + 1, \end, %stride + 1, \N +.ifnb \more + A_st_macro \N, \stride, \more .endif .endm @@ -148,13 +146,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 - nmsub_macro 0, 3, 4, D1 + nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 @@ -167,13 +165,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U2, D2, U2, U3, D2, U3 - nmsub_macro 0, 1, 2, D1 + nmsub_macro D1, 0, 2, 1, 3 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20 @@ -186,13 +184,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00 .endm @@ -204,13 +202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 1, 0, 2 + A_st_macro 2, 0, 0, 1 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 .endm @@ -222,13 +220,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 1, 0, 1 + A_st_macro 1, 0, 0, 1 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 .endm @@ -242,22 +240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 - ldrepl_macro 19, 21, 8 - nmsub_macro 8, 11, 12, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 - ldrepl_macro 16, 16, 0 - nmsub_macro 4, 7, 12, D7 - nmsub_macro 4, 7, 8, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15 + nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 - nmsub_macro 0, 3, 12, D6 - nmsub_macro 0, 3, 8, D3 - nmsub_macro 0, 3, 4, D1 + nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15 + nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11 + nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 15, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 
// Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ @@ -274,22 +272,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U6, D9, U6, U7, D9, U7 - ldrepl_macro 19, 21, 8 - nmsub_macro 4, 5, 6, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 4, 6, 5, 7 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U4, D5, U4, U5, D5, U5 - ldrepl_macro 16, 16, 0 - nmsub_macro 2, 3, 6, D7 - nmsub_macro 2, 3, 4, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 2, 6, 3, 7 + nmsub_macro D4, 2, 4, 3, 5 GMUL xvf, d, U2, D2, U2, U3, D2, U3 - nmsub_macro 0, 1, 6, D6 - nmsub_macro 0, 1, 4, D3 - nmsub_macro 0, 1, 2, D1 + nmsub_macro D6, 0, 6, 1, 7 + nmsub_macro D3, 0, 4, 1, 5 + nmsub_macro D1, 0, 2, 1, 3 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20, \ @@ -306,22 +304,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U3, D9, U3 - ldrepl_macro 19, 21, 8 - nmsub_macro 2, 2, 3, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 2, 3 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U2, D5, U2 - ldrepl_macro 16, 16, 0 - nmsub_macro 1, 1, 3, D7 - nmsub_macro 1, 1, 2, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 1, 3 + nmsub_macro D4, 1, 2 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 3, D6 - nmsub_macro 0, 0, 2, D3 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D6, 0, 3 + nmsub_macro D3, 0, 2 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 .endm @@ -335,22 +333,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U3, D9, U3 - ldrepl_macro 19, 21, 8 - nmsub_macro 2, 2, 3, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 2, 3 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U2, D5, U2 - ldrepl_macro 16, 16, 0 - nmsub_macro 1, 1, 3, D7 - nmsub_macro 1, 1, 2, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 1, 3 + nmsub_macro D4, 1, 2 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 3, D6 - nmsub_macro 0, 0, 2, D3 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D6, 0, 3 + nmsub_macro D3, 0, 2 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 3, 0, 2 + A_st_macro 2, 0, 0, 1, 2, 3 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 .endm @@ -364,22 +362,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U3, D9, U3 - ldrepl_macro 19, 21, 8 - nmsub_macro 2, 2, 3, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 2, 3 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U2, D5, U2 - ldrepl_macro 16, 16, 0 - nmsub_macro 1, 1, 3, D7 - nmsub_macro 1, 1, 2, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 1, 3 + nmsub_macro D4, 1, 2 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 3, D6 - nmsub_macro 0, 0, 2, D3 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D6, 0, 3 + nmsub_macro D3, 0, 2 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 3, 0, 1 + A_st_macro 1, 0, 0, 1, 2, 3 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, .endm @@ -399,10 +397,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_16x1: PTR_ADDI A0, T1, -16 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 .endm @@ -420,10 +418,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_8x1: PTR_ADDI A0, T1, -8 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20 .endm @@ -440,10 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L_dsolve_4x1: PTR_ADDI A0, T1, -4 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 4 + A_st_macro 4, 0, 0 // Strore C GST xv, , U0, C0, 0x00 .endm @@ -460,10 +458,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_2x1: PTR_ADDI A0, T1, -2 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 2 + A_st_macro 2, 0, 0 // Strore C GST v, , $vr0, C0, 0x00 .endm @@ -480,10 +478,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_1x1: PTR_ADDI A0, T1, -1 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 1 + A_st_macro 1, 0, 0 // Strore C GST f, d, $f0, C0, 0x00 .endm @@ -697,7 +695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 PTR_SUB KK, N, OFFSET PTR_MUL T0, N, LDC @@ -948,6 +946,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADDI KK, KK, -4 bnez J, .L_J1 .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/gemm_ncopy_6.prefx.c b/kernel/loongarch64/gemm_ncopy_6.prefx.c new file mode 100644 index 00000000..65680d4e --- /dev/null +++ b/kernel/loongarch64/gemm_ncopy_6.prefx.c @@ -0,0 +1,299 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + BLASLONG nmod6; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6 ; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + + nmod6 = n - (n / 6)* 6 ; + aoffset = a; + boffset = b; + + // prefex A: 1 block, block size: 4*8 bytes, offset: 16*8 bytes, base: aoffset1,2,,6; + BLASULONG index = 0x100080; //( (1<<20)|(16<<3)&0xffff) ) ; + // prefex B: 1 block, block size: 24*8 bytes, offset: 96*8 bytes, base: boffset; + BLASULONG index_b = 0xb00300; //(11<<20) | ((96*8)&0xffff) ; + + j = (n / 6); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset += 6 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset 
+ 2) = ctemp09; + *(boffset + 3) = ctemp13; + *(boffset + 4) = ctemp17; + *(boffset + 5) = ctemp21; + + *(boffset + 6) = ctemp02; + *(boffset + 7) = ctemp06; + *(boffset + 8) = ctemp10; + *(boffset + 9) = ctemp14; + *(boffset + 10) = ctemp18; + *(boffset + 11) = ctemp22; + + *(boffset + 12) = ctemp03; + *(boffset + 13) = ctemp07; + *(boffset + 14) = ctemp11; + *(boffset + 15) = ctemp15; + *(boffset + 16) = ctemp19; + *(boffset + 17) = ctemp23; + + *(boffset + 18) = ctemp04; + *(boffset + 19) = ctemp08; + *(boffset + 20) = ctemp12; + *(boffset + 21) = ctemp16; + *(boffset + 22) = ctemp20; + *(boffset + 23) = ctemp24; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + + boffset += 24; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + boffset += 6; + i --; + }while(i > 0); + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (nmod6 & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp06; + *(boffset + 7) = ctemp08; + + 
aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + boffset += 4; + } + } + + if (nmod6 & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (nmod6 & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + } + } + + return 0; +} diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S index a2fc9dbb..c22ade4b 100644 --- a/kernel/loongarch64/icamax_lsx.S +++ b/kernel/loongarch64/icamax_lsx.S @@ -308,8 +308,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vinsgr2vr.w x1, t3, 3 vinsgr2vr.w x2, t4, 3 addi.d I, I, -1 - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 vfmul.s x3, VI4, x1 vfmul.s x4, VI4, x2 vfcmp.clt.s VT0, x1, VI3 diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index d097b304..a2221491 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -90,57 +90,175 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_FST fst.d #endif -// The max registers available to the user which -// do not need to be preserved across calls. -// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html -#define MAX_INT_CALLER_SAVED 17 -#define MAX_FP_CALLER_SAVED 24 - .altmacro // Enable alternate macro mode +/* + * Pushing and popping static registers into/from the stack. + * regs : number of static general-purpose registers, greater than or equal to 0, less than or equal to 9 + * fregs: number of static floating-point registers, greater than or equal to 0, less than or equal to 8 + */ .macro push_if_used regs, fregs -.if \regs > MAX_INT_CALLER_SAVED - PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) - push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 +.if \regs > 0 + PTR_ADDI $sp, $sp, -(\regs << REG_LOG) + push_regs 0, \regs - 1 .endif -.if \fregs > MAX_FP_CALLER_SAVED - PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) - push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 +.if \fregs > 0 + PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG) + push_fregs 0, \fregs - 1 .endif .endm // End push_if_used + .macro pop_if_used regs, fregs -.if \fregs > MAX_FP_CALLER_SAVED - pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 - PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG +.if \fregs > 0 + pop_fregs 0, \fregs - 1 + PTR_ADDI $sp, $sp, \fregs << FREG_LOG .endif -.if \regs > MAX_INT_CALLER_SAVED - pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 - PTR_ADDI $sp, $sp, (\regs - 
MAX_INT_CALLER_SAVED) << REG_LOG +.if \regs > 0 + pop_regs 0, \regs - 1 + PTR_ADDI $sp, $sp, \regs << REG_LOG .endif .endm // End pop_if_used + .macro push_regs from, to - PTR_ST $s\()\from, $sp, \from << REG_LOG +#ifdef __clang__ +.if \to >= 0 + PTR_ST $s0, $sp, 0 << REG_LOG +.endif +.if \to >= 1 + PTR_ST $s1, $sp, 1 << REG_LOG +.endif +.if \to >= 2 + PTR_ST $s2, $sp, 2 << REG_LOG +.endif +.if \to >= 3 + PTR_ST $s3, $sp, 3 << REG_LOG +.endif +.if \to >= 4 + PTR_ST $s4, $sp, 4 << REG_LOG +.endif +.if \to >= 5 + PTR_ST $s5, $sp, 5 << REG_LOG +.endif +.if \to >= 6 + PTR_ST $s6, $sp, 6 << REG_LOG +.endif +.if \to >= 7 + PTR_ST $s7, $sp, 7 << REG_LOG +.endif +.if \to >= 8 + PTR_ST $s8, $sp, 8 << REG_LOG +.endif +#else + PTR_ST $s\()\from, $sp, \from << REG_LOG .if \to - \from push_regs %from + 1, \to .endif +#endif .endm // End push_regs + .macro pop_regs from, to +#ifdef __clang__ +.if \to >= 0 + PTR_LD $s0, $sp, 0 << REG_LOG +.endif +.if \to >= 1 + PTR_LD $s1, $sp, 1 << REG_LOG +.endif +.if \to >= 2 + PTR_LD $s2, $sp, 2 << REG_LOG +.endif +.if \to >= 3 + PTR_LD $s3, $sp, 3 << REG_LOG +.endif +.if \to >= 4 + PTR_LD $s4, $sp, 4 << REG_LOG +.endif +.if \to >= 5 + PTR_LD $s5, $sp, 5 << REG_LOG +.endif +.if \to >= 6 + PTR_LD $s6, $sp, 6 << REG_LOG +.endif +.if \to >= 7 + PTR_LD $s7, $sp, 7 << REG_LOG +.endif +.if \to >= 8 + PTR_LD $s8, $sp, 8 << REG_LOG +.endif +#else PTR_LD $s\()\from, $sp, \from << REG_LOG .if \to - \from pop_regs %from + 1, \to .endif +#endif .endm // End pop_regs + .macro push_fregs from, to +#ifdef __clang__ +.if \to >= 0 + PTR_FST $fs0, $sp, 0 << FREG_LOG +.endif +.if \to >= 1 + PTR_FST $fs1, $sp, 1 << FREG_LOG +.endif +.if \to >= 2 + PTR_FST $fs2, $sp, 2 << FREG_LOG +.endif +.if \to >= 3 + PTR_FST $fs3, $sp, 3 << FREG_LOG +.endif +.if \to >= 4 + PTR_FST $fs4, $sp, 4 << FREG_LOG +.endif +.if \to >= 5 + PTR_FST $fs5, $sp, 5 << FREG_LOG +.endif +.if \to >= 6 + PTR_FST $fs6, $sp, 6 << FREG_LOG +.endif +.if \to >= 7 + PTR_FST $fs7, $sp, 7 << FREG_LOG 
+.endif +#else PTR_FST $fs\()\from, $sp, \from << FREG_LOG .if \to - \from push_fregs %from + 1, \to .endif +#endif .endm // End push_fregs + .macro pop_fregs from, to +#ifdef __clang__ +.if \to >= 0 + PTR_FLD $fs0, $sp, 0 << FREG_LOG +.endif +.if \to >= 1 + PTR_FLD $fs1, $sp, 1 << FREG_LOG +.endif +.if \to >= 2 + PTR_FLD $fs2, $sp, 2 << FREG_LOG +.endif +.if \to >= 3 + PTR_FLD $fs3, $sp, 3 << FREG_LOG +.endif +.if \to >= 4 + PTR_FLD $fs4, $sp, 4 << FREG_LOG +.endif +.if \to >= 5 + PTR_FLD $fs5, $sp, 5 << FREG_LOG +.endif +.if \to >= 6 + PTR_FLD $fs6, $sp, 6 << FREG_LOG +.endif +.if \to >= 7 + PTR_FLD $fs7, $sp, 7 << FREG_LOG +.endif +#else PTR_FLD $fs\()\from, $sp, \from << FREG_LOG .if \to - \from pop_fregs %from + 1, \to .endif +#endif .endm // End pop_fregs // @@ -275,7 +393,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // GXOR // .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()xor.\suf_op \out, \in0, \in1 +.ifnb \pre_op + \pre_op\()xor.v \out, \in0, \in1 +.else + xor.\suf_op \out, \in0, \in1 +.endif .ifnb \more GXOR \pre_op, \suf_op, \more .endif @@ -307,6 +429,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
GPRELD \more .endif .endm +// +// GPACKEV +// +.macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()packev.\suf_op \out, \in0, \in1 +.ifnb \more + GPACKEV \pre_op, \suf_op, \more +.endif +.endm +// +// GPACKOD +// +.macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()packod.\suf_op \out, \in0, \in1 +.ifnb \more + GPACKOD \pre_op, \suf_op, \more +.endif +.endm +// +// GSHUF4I +// +.macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg + \pre_op\()shuf4i.\suf_op \out, \in0, \in1 +.ifnb \more + GSHUF4I \pre_op, \suf_op, \more +.endif +.endm + +.macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg +.ifeqs "\pre_op\()\suf_op", "vfs" + \name v, w, \more +.endif +.ifeqs "\pre_op\()\suf_op", "vfd" + \name v, d, \more +.endif +.ifeqs "\pre_op\()\suf_op", "xvfs" + \name xv, w, \more +.endif +.ifeqs "\pre_op\()\suf_op", "xvfd" + \name xv, d, \more +.endif +.endm // // Compound instructions @@ -314,61 +478,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// GACC: Accumulate the values of vector registers // .macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg -.ifeqs "\pre_op", "xvf" +.ifeqs "\pre_op\()\suf_op", "xvfd" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvfs" xvpermi.q \out, \in, 0x01 \pre_op\()add.\suf_op \in, \out, \in xvpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifeqs "\suf_op", "s" xvpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif +.ifeqs "\pre_op\()\suf_op", "vfd" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in .endif - -.ifeqs "\pre_op", "vf" +.ifeqs "\pre_op\()\suf_op", "vfs" vpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifeqs "\suf_op", "s" vpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif -.endif -.ifeqs "\pre_op", "xv" +.ifeqs "\pre_op\()\suf_op", "xvd" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvw" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvh" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in + xvpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvb" xvpermi.q \out, \in, 0x01 \pre_op\()add.\suf_op \in, \out, \in xvpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "d" xvpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "w" xvpackod.h \in, \out, \out 
\pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "h" xvpackod.b \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif + +.ifeqs "\pre_op\()\suf_op", "vd" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in .endif +.ifeqs "\pre_op\()\suf_op", "vw" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in .endif +.ifeqs "\pre_op\()\suf_op", "vh" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in + vpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in .endif - -.ifeqs "\pre_op", "v" +.ifeqs "\pre_op\()\suf_op", "vb" vpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "d" vpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "w" vpackod.h \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "h" vpackod.b \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif -.endif -.endif -.endif .ifnb \more GACC \pre_op, \suf_op, \more @@ -391,27 +590,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Note: When "pre_op = xvf && suf_op = s", in will be modified. 
// .macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg -.ifeqs "\pre_op", "xvf" +.ifeqs "\pre_op\()\suf_op", "xvfd" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \out, \out, \in +.endif + +.ifeqs "\pre_op\()\suf_op", "xvfs" xvpermi.q \out, \in, 0x01 -.ifeqs "\suf_op", "s" \pre_op\()add.\suf_op \in, \out, \in xvpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.else - \pre_op\()add.\suf_op \out, \out, \in .endif + +.ifeqs "\pre_op\()\suf_op", "vfd" + vor.v \out, \in, \in .endif -.ifeqs "\pre_op", "vf" -.ifeqs "\suf_op", "s" +.ifeqs "\pre_op\()\suf_op", "vfs" vpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.else - vor.v \out, \in, \in -.endif .endif - .ifnb \more GCOMPLEXACC \pre_op, \suf_op, \more .endif @@ -430,56 +629,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // suf_op: s or d, differentiate between single precision or double precision complex numbers // .macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg -.ifeqs "\pre_op", "xvf" - xvxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - xvpackev.w \tmp0, \in0, \in0 -.else - xvpackev.d \tmp0, \in0, \in0 -.endif -.else - vxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - vpackev.w \tmp0, \in0, \in0 -.else - vpackev.d \tmp0, \in0, \in0 -.endif -.endif + TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 + TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 -.ifeqs "\pre_op", "xvf" -.ifeqs "\suf_op", "s" .ifeqs "\xconj", "0" - xvpackod.w \tmp1, \in0, \tmp1 + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 .else - xvpackod.w \tmp1, \tmp1, \in0 + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 .endif - xvshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\xconj", "0" - xvpackod.d \tmp1, \in0, \tmp1 -.else - xvpackod.d \tmp1, \tmp1, \in0 -.endif - xvshuf4i.d \tmp2, \in1, 0x0b -.endif -.else + .ifeqs "\suf_op", "s" 
-.ifeqs "\xconj", "0" - vpackod.w \tmp1, \in0, \tmp1 -.else - vpackod.w \tmp1, \tmp1, \in0 -.endif - vshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\xconj", "0" - vpackod.d \tmp1, \in0, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .else - vpackod.d \tmp1, \tmp1, \in0 -.endif - vshuf4i.d \tmp2, \in1, 0x0b -.endif + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif \pre_op\()mul.\suf_op \out, \tmp0, \in1 @@ -512,112 +676,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // suf_op: s or d, differentiate between single precision or double precision complex numbers // .macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg -.ifeqs "\pre_op", "xvf" - xvxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - xvpackev.w \tmp0, \in0, \in0 -.else - xvpackev.d \tmp0, \in0, \in0 -.endif -.else - vxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - vpackev.w \tmp0, \in0, \in0 -.else - vpackev.d \tmp0, \in0, \in0 -.endif -.endif + TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 + TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 -.ifeqs "\conj", "1" + +.ifeqs "\conj\()\suf_op", "1s" \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 -.ifeqs "\pre_op", "xvf" -.ifeqs "\suf_op", "s" - xvshuf4i.w \tmp0, \tmp0, 0xb1 - xvpackev.w \out, \tmp0, \tmp2 -.else - xvshuf4i.d \tmp0, \tmp0, 0x0b - xvpackev.d \out, \tmp0, \tmp2 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1 + TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 .endif -.else -.ifeqs "\suf_op", "s" - vshuf4i.w \tmp0, \tmp0, 0xb1 - vpackev.w \out, \tmp0, \tmp2 -.else - vshuf4i.d \tmp0, \tmp0, 0x0b - vpackev.d \out, \tmp0, \tmp2 +.ifeqs "\conj\()\suf_op", "1d" + \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b + TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 .endif -.endif /* 
pre_op = xvf */ -.else +.ifeqs "\conj", "0" \pre_op\()add.\suf_op \out, \tmp2, \tmp1 -.endif /* conj = 1 */ +.endif \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 -.ifeqs "\pre_op", "xvf" -.ifeqs "\suf_op", "s" -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - xvpackod.w \tmp1, \in0, \tmp1 -.else - xvpackod.w \tmp1, \tmp1, \in0 +.ifeqs "\xconj\()\conj\()\suf_op", "00s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif -.else -.ifeqs "\xconj", "0" - xvpackod.w \tmp1, \in0, \in0 -.else - xvpackod.w \tmp1, \tmp1, \tmp1 +.ifeqs "\xconj\()\conj\()\suf_op", "10s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif +.ifeqs "\xconj\()\conj\()\suf_op", "01s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif - xvshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - xvpackod.d \tmp1, \in0, \tmp1 -.else - xvpackod.d \tmp1, \tmp1, \in0 -.endif -.else -.ifeqs "\xconj", "0" - xvpackod.d \tmp1, \in0, \in0 -.else - xvpackod.d \tmp1, \tmp1, \tmp1 -.endif -.endif - xvshuf4i.d \tmp2, \in1, 0x0b -.endif -.else -.ifeqs "\suf_op", "s" -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - vpackod.w \tmp1, \in0, \tmp1 -.else - vpackod.w \tmp1, \tmp1, \in0 +.ifeqs "\xconj\()\conj\()\suf_op", "11s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif -.else -.ifeqs "\xconj", "0" - vpackod.w \tmp1, \in0, \in0 -.else - vpackod.w \tmp1, \tmp1, \tmp1 -.endif -.endif - vshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - vpackod.d \tmp1, \in0, \tmp1 -.else - vpackod.d \tmp1, \tmp1, \in0 -.endif -.else -.ifeqs "\xconj", "0" - vpackod.d \tmp1, \in0, \in0 -.else - vpackod.d \tmp1, \tmp1, \tmp1 +.ifeqs "\xconj\()\conj\()\suf_op", "00d" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 + 
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif +.ifeqs "\xconj\()\conj\()\suf_op", "10d" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif - vshuf4i.d \tmp2, \in1, 0x0b +.ifeqs "\xconj\()\conj\()\suf_op", "01d" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif +.ifeqs "\xconj\()\conj\()\suf_op", "11d" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S index 566bce6c..431a44c1 100644 --- a/kernel/loongarch64/scal.S +++ b/kernel/loongarch64/scal.S @@ -53,11 +53,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE li.d TEMP, SIZE + ld.d XX, $sp, 0 // Load dummy2 + slli.d XX, XX, BASE_SHIFT MTC a1, $r0 slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 CMPEQ $fcc0, ALPHA, a1 bceqz $fcc0, .L50 + beq XX, TEMP, .L50 // if dummp2 == 1, do not directly copy 0 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L15 diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S index 48e2c071..dd69636e 100644 --- a/kernel/loongarch64/scal_lasx.S +++ b/kernel/loongarch64/scal_lasx.S @@ -52,18 +52,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 + ld.d t1, $sp, 0 // Load dummp2 movgr2fr.d a1, $r0 FFINT a1, a1 movgr2fr.d a2, TEMP FFINT a2, a2 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT + slli.d t1, t1, BASE_SHIFT CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L20 //ALPHA==0 CMPEQ $fcc0, ALPHA, a2 bcnez $fcc0, .L999 //ALPHA==1 return +.L1: srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + beq INCX, TEMP, .L30 //ALPHA !=0|1 and INCX==1 MTG TEMP, ALPHA #ifdef DOUBLE xvreplgr2vr.d VALPHA, TEMP @@ -73,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move XX, X .align 3 -.L10: //ALPHA!=0|1 and INCX!=1 +.L10: //ALPHA !=0|1 and INCX!=1 bge $r0, I, .L32 .align 3 .L11: @@ -168,6 +171,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L20: + beq t1, TEMP, .L1 // if dummp2 == 1, do not directly copy 0 srai.d I, N, 3 beq INCX, TEMP, .L24 bge $r0, I, .L22 diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S index 1ffce7db..57dc5d0d 100644 --- a/kernel/loongarch64/scal_lsx.S +++ b/kernel/loongarch64/scal_lsx.S @@ -51,6 +51,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d t1, $sp, 0 // Load dummy2 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -58,12 +59,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FFINT a2, a2 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT + slli.d t1, t1, BASE_SHIFT CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L20 //ALPHA==0 CMPEQ $fcc0, ALPHA, a2 bcnez $fcc0, .L999 //ALPHA==1 return +.L1: srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + beq INCX, TEMP, .L30 //ALPHA !=0|1 and INCX==1 MTG TEMP, ALPHA #ifdef DOUBLE vreplgr2vr.d VALPHA, TEMP @@ -73,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
move XX, X .align 3 -.L10: //ALPHA!=0|1 and INCX!=1 +.L10: //ALPHA !=0|1 and INCX!=1 bge $r0, I, .L32 .align 3 @@ -172,6 +175,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L20: + beq t1, TEMP, .L1 // if dummp2 == 1, do not directly copy 0 srai.d I, N, 3 beq INCX, TEMP, .L24 bge $r0, I, .L22 diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S index bd609394..c6d1aeae 100644 --- a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S +++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S @@ -837,7 +837,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 xvreplve0.w VALPHA, $xr0 #if defined (TRMMKERNEL) && !defined(LEFT) PTR_SUB OFF, ZERO, OFFSET @@ -2343,6 +2343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif // #if defined(TRMMKERNEL) .L_N1_M0: .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S index 266c07c5..1a81ce60 100644 --- a/kernel/loongarch64/sgemm_ncopy_16_lasx.S +++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //.L_N0 PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 move TD, DST move TS, SRC @@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADDI M, M, -1 blt ZERO, M, .L_N1_M1 .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S index 5c173568..db36827d 100644 --- a/kernel/loongarch64/sgemm_ncopy_8_lasx.S +++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//.L_N0 PROLOGUE - push_if_used 17, 20 + push_if_used 0, 0 move TD, DST move TS, SRC @@ -293,6 +293,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADDI M, M, -1 blt ZERO, M, .L_N1_M1 .L_N0: - pop_if_used 17, 20 + pop_if_used 0, 0 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S index d9789bdc..fc42ae8c 100644 --- a/kernel/loongarch64/sgemm_tcopy_16_lasx.S +++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //.L_M0 PROLOGUE - push_if_used 24, 8 + push_if_used 7, 0 move S0, SRC move P0, DST @@ -521,6 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADDI S1, S1, 0x04 PTR_ADDI P5, P5, 0x04 .L_M0: - pop_if_used 24, 8 + pop_if_used 7, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S index 725a47a6..73d08fb8 100644 --- a/kernel/loongarch64/sgemm_tcopy_8_lasx.S +++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //.L_M0 PROLOGUE - push_if_used 23, 8 + push_if_used 6, 0 move S0, SRC move P0, DST @@ -401,6 +401,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADDI S1, S1, 0x04 PTR_ADDI P4, P4, 0x04 .L_M0: - pop_if_used 23, 8 + pop_if_used 6, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S index 52ffc320..8648c221 100644 --- a/kernel/loongarch64/sgemv_n_8_lasx.S +++ b/kernel/loongarch64/sgemv_n_8_lasx.S @@ -418,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 19 + push_if_used 7, 0 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 19 + pop_if_used 7, 0 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S index f4bfffb4..1f843cad 100644 --- a/kernel/loongarch64/sgemv_t_8_lasx.S +++ b/kernel/loongarch64/sgemv_t_8_lasx.S @@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 18 + push_if_used 8, 0 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -400,6 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1: /* if (incx != 1) */ SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP .L_END: - pop_if_used 17 + 8, 18 + pop_if_used 8, 0 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LN_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_LN_UNROLLN6.c new file mode 100644 index 00000000..5e25a5e3 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN_UNROLLN6.c @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 
* bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } 
+ + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 1)) { + if (nmodN) { + + // j = (GEMM_UNROLL_N >> 1); + j = 4; + while (j > 0) { + if (nmodN & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/loongarch64/trsm_kernel_LT_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_LT_UNROLLN6.c new file mode 100644 index 00000000..2106c88c --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LT_UNROLLN6.c @@ -0,0 +1,327 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = 
cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 1)) { + if 
(nmodN) { + + // j = (GEMM_UNROLL_N >> 1); + j = 4; + + while (j > 0) { + if (nmodN & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/loongarch64/trsm_kernel_RN_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_RN_UNROLLN6.c new file mode 100644 index 00000000..42d5155c --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RN_UNROLLN6.c @@ -0,0 +1,325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = 
cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 
1)) { + if (nmodN) { + + // j = (GEMM_UNROLL_N >> 1); + j = 4; + + while (j > 0) { + if (nmodN & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/loongarch64/trsm_kernel_RT_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_RT_UNROLLN6.c new file mode 100644 index 00000000..7424ad57 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RT_UNROLLN6.c @@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + 
cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 1)) { + if (nmodN) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (nmodN & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + 
+ solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/loongarch64/zgemv_n_2_lsx.S b/kernel/loongarch64/zgemv_n_2_lsx.S index efb37611..d6815400 100644 --- a/kernel/loongarch64/zgemv_n_2_lsx.S +++ b/kernel/loongarch64/zgemv_n_2_lsx.S @@ -253,7 +253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 31 + push_if_used 7, 7 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -291,6 +291,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 31 + pop_if_used 7, 7 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S index 26edf1ed..2e0e0a06 100644 --- a/kernel/loongarch64/zgemv_n_4_lasx.S +++ b/kernel/loongarch64/zgemv_n_4_lasx.S @@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 31 + push_if_used 7, 7 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -337,7 +337,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 31 + pop_if_used 7, 7 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/zgemv_t_2_lsx.S b/kernel/loongarch64/zgemv_t_2_lsx.S index 2a0fc172..cae2a0ce 100644 --- a/kernel/loongarch64/zgemv_t_2_lsx.S +++ b/kernel/loongarch64/zgemv_t_2_lsx.S @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 30 + push_if_used 8, 6 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -263,6 +263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L_GAP_1: /* if (incx != 1) */ ZGEMV_T_LSX GAP_1, X2_GAP .L_END: - pop_if_used 17 + 8, 30 + pop_if_used 8, 6 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/zgemv_t_4_lasx.S b/kernel/loongarch64/zgemv_t_4_lasx.S index 4d33b8f9..50dd73ad 100644 --- a/kernel/loongarch64/zgemv_t_4_lasx.S +++ b/kernel/loongarch64/zgemv_t_4_lasx.S @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 30 + push_if_used 8, 6 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -294,6 +294,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1: /* if (incx != 1) */ ZGEMV_T_LASX GAP_1, X4_GAP .L_END: - pop_if_used 17 + 8, 30 + pop_if_used 8, 6 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c index 800667b6..f05c42bd 100644 --- a/kernel/mips/cgemv_t_msa.c +++ b/kernel/mips/cgemv_t_msa.c @@ -31,27 +31,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP0 #undef OP1 #undef OP2 +#undef OP3 +#undef OP4 +#undef OP5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + #define OP0 -= + #define OP1 += + #define OP2 += +#else + #define OP0 += + #define OP1 += + #define OP2 -= +#endif -#if !defined(CONJ) - #if !defined(XCONJ) - #define OP0 -= - #define OP1 += - #define OP2 += - #else - #define OP0 += - #define OP1 += - #define OP2 -= - #endif +#if !defined(XCONJ) + #define OP3 -= + #define OP4 += + #define OP5 += #else - #if !defined(XCONJ) - #define OP0 += - #define OP1 -= - #define OP2 += - #else - #define OP0 -= - #define OP1 -= - #define OP2 -= - #endif + #define OP3 += + #define OP4 -= + #define OP5 += #endif #define CGEMV_T_8x4() \ @@ -268,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
res3i = y[3 * inc_y2 + 1]; \ \ res0r += alphar * temp0r; \ - res0r OP0 alphai * temp0i; \ + res0r OP3 alphai * temp0i; \ res1r += alphar * temp1r; \ - res1r OP0 alphai * temp1i; \ + res1r OP3 alphai * temp1i; \ res2r += alphar * temp2r; \ - res2r OP0 alphai * temp2i; \ + res2r OP3 alphai * temp2i; \ res3r += alphar * temp3r; \ - res3r OP0 alphai * temp3i; \ + res3r OP3 alphai * temp3i; \ \ - res0i OP1 alphar * temp0i; \ - res0i OP2 alphai * temp0r; \ - res1i OP1 alphar * temp1i; \ - res1i OP2 alphai * temp1r; \ - res2i OP1 alphar * temp2i; \ - res2i OP2 alphai * temp2r; \ - res3i OP1 alphar * temp3i; \ - res3i OP2 alphai * temp3r; \ + res0i OP4 alphar * temp0i; \ + res0i OP5 alphai * temp0r; \ + res1i OP4 alphar * temp1i; \ + res1i OP5 alphai * temp1r; \ + res2i OP4 alphar * temp2i; \ + res2i OP5 alphai * temp2r; \ + res3i OP4 alphar * temp3i; \ + res3i OP5 alphai * temp3r; \ \ y[0 * inc_y2] = res0r; \ y[1 * inc_y2] = res1r; \ @@ -303,14 +304,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. res1i = y[1 * inc_y2 + 1]; \ \ res0r += alphar * temp0r; \ - res0r OP0 alphai * temp0i; \ + res0r OP3 alphai * temp0i; \ res1r += alphar * temp1r; \ - res1r OP0 alphai * temp1i; \ + res1r OP3 alphai * temp1i; \ \ - res0i OP1 alphar * temp0i; \ - res0i OP2 alphai * temp0r; \ - res1i OP1 alphar * temp1i; \ - res1i OP2 alphai * temp1r; \ + res0i OP4 alphar * temp0i; \ + res0i OP5 alphai * temp0r; \ + res1i OP4 alphar * temp1i; \ + res1i OP5 alphai * temp1r; \ \ y[0 * inc_y2] = res0r; \ y[1 * inc_y2] = res1r; \ @@ -324,10 +325,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
res0i = y[0 * inc_y2 + 1]; \ \ res0r += alphar * temp0r; \ - res0r OP0 alphai * temp0i; \ + res0r OP3 alphai * temp0i; \ \ - res0i OP1 alphar * temp0i; \ - res0i OP2 alphai * temp0r; \ + res0i OP4 alphar * temp0i; \ + res0i OP5 alphai * temp0r; \ \ y[0 * inc_y2] = res0r; \ y[0 * inc_y2 + 1] = res0i; \ diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 2e41d8be..cc8d8344 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, if (1 == inc_x) { - if (0.0 == da) + if (0.0 == da && !dummy2) { v2f64 zero_v = {0.0, 0.0}; @@ -240,7 +240,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, } else { - if (da == 0.0) + if (da == 0.0 && !dummy2) { for (i = n; i--;) { diff --git a/kernel/mips/scal.c b/kernel/mips/scal.c index 01f708b1..5f12d427 100644 --- a/kernel/mips/scal.c +++ b/kernel/mips/scal.c @@ -29,22 +29,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i=0,j=0; - - while(j < n) - { - - if ( da == 0.0 ) - x[i]=0.0; - else - x[i] = da * x[i] ; - - i += inc_x ; - j++; - - } - return 0; - + BLASLONG i = 0, j = 0; + + // Resolved issue 4728 when the caller is {s/d}scal + if (da == 0.0 && dummy2 == 1) + { + while(j < n) + { + x[i] = da * x[i] ; + + i += inc_x ; + j++; + } + } + else + { + while(j < n) + { + + if ( da == 0.0 ) + x[i] = 0.0; + else + x[i] = da * x[i] ; + + i += inc_x ; + j++; + } + } + return 0; } diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 66e17b84..953cf0fb 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, if (1 == inc_x) { - if (0.0 == da) + if (0.0 == da && !dummy2) { v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; @@ -255,7 +255,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, } else { - if (0.0 == da) + if (0.0 == da && !dummy2) { for (i = n; i--;) { diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index 6492f90b..022bc601 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -33,27 +33,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef OP2 #undef OP3 #undef OP4 +#undef OP5 -#if !defined(CONJ) - #if !defined(XCONJ) - #define OP0 -= - #define OP1 += - #define OP2 += - #else - #define OP0 += - #define OP1 += - #define OP2 -= - #endif +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + #define OP0 -= + #define OP1 += + #define OP2 += #else - #if !defined(XCONJ) - #define OP0 += - #define OP1 -= - #define OP2 += - #else - #define OP0 -= - #define OP1 -= - #define OP2 -= - #endif + #define OP0 += + #define OP1 += + #define OP2 -= +#endif + +#if !defined(XCONJ) + #define OP3 -= + #define OP4 += + #define OP5 += +#else + #define OP3 += + #define OP4 -= + #define OP5 += #endif #define ZGEMV_T_8x1() \ @@ -124,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. res0i = y[0 * inc_y2 + 1]; \ \ res0r += alphar * temp0r; \ - res0r OP0 alphai * temp0i; \ + res0r OP3 alphai * temp0i; \ \ - res0i OP1 alphar * temp0i; \ - res0i OP2 alphai * temp0r; \ + res0i OP4 alphar * temp0i; \ + res0i OP5 alphai * temp0r; \ \ y[0 * inc_y2] = res0r; \ y[0 * inc_y2 + 1] = res0i; \ diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index 7bb26194..ae1c87fc 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -48,7 +48,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F { temp = - da_i * x[ip+1] ; if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; - x[ip+1] = da_i * x[ip] ; + if (!isinf(x[ip+1])) + x[ip+1] = da_i * x[ip] ; + else x[ip+1] = NAN; } } else @@ -56,12 +58,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F if ( da_i == 0.0 ) { temp = da_r * x[ip] ; - x[ip+1] = da_r * x[ip+1]; + if (!isinf(x[ip+1])) + x[ip+1] = da_r * x[ip+1]; + else x[ip+1] = NAN; } else { temp = da_r * x[ip] - da_i * x[ip+1] ; - x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + if (!isinf(x[ip+1])) + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + else x[ip+1] = NAN; } } if ( da_r != da_r ) diff --git 
a/kernel/mips64/asum.S b/kernel/mips64/asum.S index 2bf95c65..4e75529f 100644 --- a/kernel/mips64/asum.S +++ b/kernel/mips64/asum.S @@ -73,6 +73,7 @@ MTC $0, s1 MTC $0, s2 + blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT blez N, .L999 diff --git a/kernel/mips64/scal.S b/kernel/mips64/scal.S index b28b8a30..49716d3e 100644 --- a/kernel/mips64/scal.S +++ b/kernel/mips64/scal.S @@ -48,6 +48,7 @@ #define TEMP $3 #define XX $5 +#define DUMMY2 $6 #define ALPHA $f15 @@ -73,10 +74,13 @@ blez N, .L999 dsll INCX, INCX, BASE_SHIFT - CMPEQ $fcc0, ALPHA, a1 - NOP + CMPEQ $fcc0, ALPHA, a1 + LDARG DUMMY2, 8($sp) bc1f $fcc0, .L50 + dsll DUMMY2, DUMMY2, BASE_SHIFT + + beq DUMMY2, TEMP, .L50 // If dummy2 == 1, do not directly copy 0 NOP bne INCX, TEMP, .L20 diff --git a/kernel/mips64/sum.S b/kernel/mips64/sum.S index 261630d4..725346ff 100644 --- a/kernel/mips64/sum.S +++ b/kernel/mips64/sum.S @@ -73,6 +73,7 @@ MTC $0, s1 MTC $0, s2 + blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT blez N, .L999 diff --git a/kernel/mips64/zasum.S b/kernel/mips64/zasum.S index cd22f984..a7af6918 100644 --- a/kernel/mips64/zasum.S +++ b/kernel/mips64/zasum.S @@ -73,6 +73,7 @@ MTC $0, s1 MTC $0, s2 + blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT blez N, .L999 diff --git a/kernel/mips64/zsum.S b/kernel/mips64/zsum.S index 129b9790..83c53931 100644 --- a/kernel/mips64/zsum.S +++ b/kernel/mips64/zsum.S @@ -73,6 +73,7 @@ MTC $0, s1 MTC $0, s2 + blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT blez N, .L999 diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c index 2a44a9e3..f71b6f98 100644 --- a/kernel/power/cswap_microk_power10.c +++ b/kernel/power/cswap_microk_power10.c @@ -58,6 +58,16 @@ static void cswap_kernel_32 (long n, float *x, float *y) "lxvp 62, 224(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) 
\n\t" + "stxv 39, 112(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -66,9 +76,20 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 36, 80(%3) \n\t" "stxv 39, 96(%3) \n\t" "stxv 38, 112(%3) \n\t" +#endif "addi %3, %3, 128 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 0(%3) \n\t" + "stxv 41, 16(%3) \n\t" + "stxv 42, 32(%3) \n\t" + "stxv 43, 48(%3) \n\t" + "stxv 44, 64(%3) \n\t" + "stxv 45, 80(%3) \n\t" + "stxv 46, 96(%3) \n\t" + "stxv 47, 112(%3) \n\t" +#else "stxv 41, 0(%3) \n\t" "stxv 40, 16(%3) \n\t" "stxv 43, 32(%3) \n\t" @@ -77,9 +98,20 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 44, 80(%3) \n\t" "stxv 47, 96(%3) \n\t" "stxv 46, 112(%3) \n\t" +#endif "addi %3, %3, 128 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 52, 64(%4) \n\t" + "stxv 53, 80(%4) \n\t" + "stxv 54, 96(%4) \n\t" + "stxv 55, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -88,9 +120,20 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 52, 80(%4) \n\t" "stxv 55, 96(%4) \n\t" "stxv 54, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 56, 0(%4) \n\t" + "stxv 57, 16(%4) \n\t" + "stxv 58, 32(%4) \n\t" + "stxv 59, 48(%4) \n\t" + "stxv 60, 64(%4) \n\t" + "stxv 61, 80(%4) \n\t" + "stxv 62, 96(%4) \n\t" + "stxv 63, 112(%4) \n\t" +#else "stxv 57, 0(%4) \n\t" "stxv 56, 16(%4) \n\t" "stxv 59, 32(%4) \n\t" @@ -99,6 +142,7 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 60, 80(%4) \n\t" "stxv 63, 96(%4) \n\t" "stxv 62, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 96c4e51b..da808397 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -73,16 +73,50 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT 
*x) for( i=0; i= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + if (dummy2 == 0) for (j = 0; j < align; j++) { - x[j] = 0.0; + x [j] = 0.0; + } + else + for (j = 0; j < align; j++) { + if (isfinite(x[j])) + x[j] = 0.0; + else + x[j] = NAN; } } BLASLONG n1 = (n-j) & -16; @@ -124,13 +166,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS j=n1; } #endif - + if (dummy2 == 0) while(j < n) { - x[j]=0.0; j++; } + else + while(j < n) + { + if (!isfinite(x[j])) + x[j]=NAN; + else + x[j]=0.0; + j++; + } } else @@ -173,11 +223,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - + if (dummy2 == 0) while(j < n) { - x[i]=0.0; + i += inc_x; + j++; + } + else + while(j < n) + { + if (!isfinite(x[i])) + x[i]=NAN; + else + x[i]=0.0; i += inc_x ; j++; } diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index b274f765..47771faf 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -864,15 +864,15 @@ LL(22): LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) - FMADD f2, f18, f24, f2 - FMADD f3, f19, f24, f3 - FMADD f6, f18, f25, f6 - FMADD f7, f19, f25, f7 + FMADD f0, f18, f24, f0 + FMADD f1, f19, f24, f1 + FMADD f4, f18, f25, f4 + FMADD f5, f19, f25, f5 - FMADD f10, f18, f26, f10 - FMADD f11, f19, f26, f11 - FMADD f14, f18, f27, f14 - FMADD f15, f19, f27, f15 + FMADD f8, f18, f26, f8 + FMADD f9, f19, f26, f9 + FMADD f12, f18, f27, f12 + FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -899,15 +899,15 @@ LL(22): LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) - FMADD f2, f18, f24, f2 - FMADD f3, f19, f24, f3 - FMADD f6, f18, f25, f6 - FMADD f7, f19, f25, f7 + FMADD f0, f18, f24, f0 + FMADD f1, f19, f24, f1 + FMADD f4, f18, f25, f4 + FMADD f5, f19, f25, f5 - FMADD f10, f18, f26, f10 - FMADD f11, f19, f26, f11 - FMADD f14, f18, f27, f14 - FMADD f15, f19, f27, f15 + FMADD f8, f18, 
f26, f8 + FMADD f9, f19, f26, f9 + FMADD f12, f18, f27, f12 + FMADD f13, f19, f27, f13 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -923,14 +923,6 @@ LL(22): addi BO, BO, 16 * SIZE bdnz LL(22) - fadd f0, f2, f0 - fadd f1, f3, f1 - fadd f4, f6, f4 - fadd f5, f7, f5 - fadd f8, f10, f8 - fadd f9, f11, f9 - fadd f12, f14, f12 - fadd f13, f15, f13 .align 4 LL(25): @@ -1161,10 +1153,10 @@ LL(32): LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) - FMADD f1, f17, f24, f1 - FMADD f5, f17, f25, f5 - FMADD f9, f17, f26, f9 - FMADD f13, f17, f27, f13 + FMADD f0, f17, f24, f0 + FMADD f4, f17, f25, f4 + FMADD f8, f17, f26, f8 + FMADD f12, f17, f27, f12 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) @@ -1181,10 +1173,10 @@ LL(32): LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) - FMADD f1, f19, f24, f1 - FMADD f5, f19, f25, f5 - FMADD f9, f19, f26, f9 - FMADD f13, f19, f27, f13 + FMADD f0, f19, f24, f0 + FMADD f4, f19, f25, f4 + FMADD f8, f19, f26, f8 + FMADD f12, f19, f27, f12 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -1200,10 +1192,6 @@ LL(32): addi BO, BO, 16 * SIZE bdnz LL(32) - fadd f0, f1, f0 - fadd f4, f5, f4 - fadd f8, f9, f8 - fadd f12, f13, f12 .align 4 LL(35): @@ -1691,10 +1679,10 @@ LL(52): FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 - FMADD f4, f18, f22, f4 - FMADD f5, f19, f22, f5 - FMADD f6, f18, f23, f6 - FMADD f7, f19, f23, f7 + FMADD f0, f18, f22, f0 + FMADD f1, f19, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -1711,10 +1699,10 @@ LL(52): FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 - FMADD f4, f18, f26, f4 - FMADD f5, f19, f26, f5 - FMADD f6, f18, f27, f6 - FMADD f7, f19, f27, f7 + FMADD f0, f18, f26, f0 + FMADD f1, f19, f26, f1 + FMADD f2, f18, f27, f2 + FMADD f3, f19, f27, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -1775,21 +1763,11 @@ LL(58): LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) - FADD f0, f4, f0 - FADD f1, f5, f1 - FADD f2, f6, f2 - FADD f3, f7, f3 - FMADD f0, f0, f30, f16 
FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else - FADD f0, f4, f0 - FADD f1, f5, f1 - FADD f2, f6, f2 - FADD f3, f7, f3 - FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 @@ -1916,8 +1894,8 @@ LL(60): LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 - FMADD f2, f17, f22, f2 - FMADD f3, f17, f23, f3 + FMADD f0, f17, f22, f0 + FMADD f1, f17, f23, f1 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) @@ -1926,8 +1904,8 @@ LL(62): FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 - FMADD f2, f19, f26, f2 - FMADD f3, f19, f27, f3 + FMADD f0, f19, f26, f0 + FMADD f1, f19, f27, f1 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -1986,15 +1964,9 @@ LL(68): LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) - FADD f0, f2, f0 - FADD f1, f3, f1 - FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f18 #else - FADD f0, f2, f0 - FADD f1, f3, f1 - FMUL f0, f0, f30 FMUL f1, f1, f30 #endif @@ -2007,7 +1979,6 @@ LL(68): fmr f4, f0 fmr f5, f0 - #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -2332,8 +2303,8 @@ LL(80): LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 - FMADD f2, f18, f21, f2 - FMADD f3, f19, f21, f3 + FMADD f0, f18, f21, f0 + FMADD f1, f19, f21, f1 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2342,8 +2313,8 @@ LL(82): FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 - FMADD f2, f18, f23, f2 - FMADD f3, f19, f23, f3 + FMADD f0, f18, f23, f0 + FMADD f1, f19, f23, f1 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2401,15 +2372,9 @@ LL(88): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) - FADD f0, f2, f0 - FADD f1, f3, f1 - FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 #else - FADD f0, f2, f0 - FADD f1, f3, f1 - FMUL f0, f0, f30 FMUL f1, f1, f30 #endif @@ -2418,9 +2383,6 @@ LL(88): STFD f1, 1 * SIZE(CO1) lfs f0, FZERO - fmr f1, f0 - fmr f2, f0 - fmr f3, f0 addi CO1, CO1, 2 * SIZE @@ -2512,9 +2474,9 @@ LL(90): LL(92): FMADD f0, f16, f20, f0 - FMADD f1, f17, f21, f1 - FMADD f2, f18, f22, f2 - FMADD f3, f19, 
f23, f3 + FMADD f0, f17, f21, f0 + FMADD f0, f18, f22, f0 + FMADD f0, f19, f23, f0 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2527,9 +2489,9 @@ LL(92): LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 - FMADD f1, f17, f21, f1 - FMADD f2, f18, f22, f2 - FMADD f3, f19, f23, f3 + FMADD f0, f17, f21, f0 + FMADD f0, f18, f22, f0 + FMADD f0, f19, f23, f0 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2583,16 +2545,8 @@ LL(98): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) - FADD f0, f1, f0 - FADD f2, f3, f2 - FADD f0, f2, f0 - FMADD f0, f0, f30, f16 #else - FADD f0, f1, f0 - FADD f2, f3, f2 - FADD f0, f2, f0 - FMUL f0, f0, f30 #endif diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index accdad70..0cabe89e 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -409,14 +409,6 @@ LL(11): fmr y06, y01 fmr y07, y01 fmr y08, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 - fmr y13, y01 - fmr y14, y01 - fmr y15, y01 - fmr y16, y01 DCBT(Y1, PREC) @@ -465,24 +457,24 @@ LL(12): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -505,24 +497,24 @@ LL(12): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 5 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 5 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 5 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 
LFD a4, 5 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 5 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 5 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 5 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 5 * SIZE(AO8) LFD b1, 9 * SIZE(BO) @@ -550,24 +542,24 @@ LL(12): FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 7 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 7 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 7 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 7 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 7 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 7 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 7 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -590,24 +582,24 @@ LL(12): FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) - FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 LFD a1, 9 * SIZE(AO1) - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 LFD a2, 9 * SIZE(AO2) - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 LFD a3, 9 * SIZE(AO3) - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 LFD a4, 9 * SIZE(AO4) - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 LFD a5, 9 * SIZE(AO5) - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 LFD a6, 9 * SIZE(AO6) - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 LFD a7, 9 * SIZE(AO7) - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 LFD a8, 9 * SIZE(AO8) LFD b5, 13 * SIZE(BO) @@ -640,24 +632,24 @@ LL(12): FMADD y08, a8, b1, y08 LFD a8, 10 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 11 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 11 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 11 * SIZE(AO3) - FMADD y12, a4, b2, y12 + 
FMADD y04, a4, b2, y04 LFD a4, 11 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 11 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 11 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 11 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 11 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -680,24 +672,24 @@ LL(12): FMADD y08, a8, b3, y08 LFD a8, 12 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 13 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 13 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 13 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 13 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 13 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 13 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 13 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 13 * SIZE(AO8) LFD b1, 17 * SIZE(BO) @@ -725,24 +717,24 @@ LL(12): FMADD y08, a8, b5, y08 LFD a8, 14 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 15 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 15 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 15 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 15 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 15 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 15 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 15 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 15 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -765,14 +757,14 @@ LL(12): FMADD y08, a8, b7, y08 LFD a8, 16 * SIZE(AO8) - FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 LFD a1, 17 * SIZE(AO1) - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 LFD a2, 17 * SIZE(AO2) - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 
LFD a3, 17 * SIZE(AO3) - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 LFD a4, 17 * SIZE(AO4) addi AO1, AO1, 16 * SIZE @@ -780,14 +772,14 @@ LL(12): addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 LFD a5, 17 * SIZE(AO5) - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 LFD a6, 17 * SIZE(AO6) - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 LFD a7, 17 * SIZE(AO7) - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 LFD a8, 17 * SIZE(AO8) LFD b5, 21 * SIZE(BO) @@ -830,24 +822,24 @@ LL(13): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -870,24 +862,24 @@ LL(13): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 5 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 5 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 5 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 5 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 5 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 5 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 5 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 5 * SIZE(AO8) LFD b1, 9 * SIZE(BO) @@ -915,24 +907,24 @@ LL(13): FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 7 * 
SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 7 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 7 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 7 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 7 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 7 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 7 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -955,24 +947,24 @@ LL(13): FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) - FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 LFD a1, 9 * SIZE(AO1) - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 LFD a2, 9 * SIZE(AO2) - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 LFD a3, 9 * SIZE(AO3) - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 LFD a4, 9 * SIZE(AO4) - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 LFD a5, 9 * SIZE(AO5) - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 LFD a6, 9 * SIZE(AO6) - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 LFD a7, 9 * SIZE(AO7) - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 LFD a8, 9 * SIZE(AO8) LFD b5, 13 * SIZE(BO) @@ -1000,24 +992,24 @@ LL(13): FMADD y08, a8, b1, y08 LFD a8, 10 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 11 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 11 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 11 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 11 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 11 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 11 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 11 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 11 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -1040,24 +1032,24 @@ LL(13): FMADD y08, a8, b3, y08 LFD a8, 12 * SIZE(AO8) - FMADD y09, a1, b4, y09 + 
FMADD y01, a1, b4, y01 LFD a1, 13 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 13 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 13 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 13 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 13 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 13 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 13 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 13 * SIZE(AO8) FMADD y01, a1, b5, y01 @@ -1080,24 +1072,24 @@ LL(13): FMADD y08, a8, b5, y08 LFD a8, 14 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 15 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 15 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 15 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 15 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 15 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 15 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 15 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 15 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -1120,20 +1112,20 @@ LL(13): FMADD y08, a8, b7, y08 LFD a8, 16 * SIZE(AO8) - FMADD y09, a1, b8, y09 - FMADD y10, a2, b8, y10 - FMADD y11, a3, b8, y11 - FMADD y12, a4, b8, y12 + FMADD y01, a1, b8, y01 + FMADD y02, a2, b8, y02 + FMADD y03, a3, b8, y03 + FMADD y04, a4, b8, y04 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE - FMADD y13, a5, b8, y13 - FMADD y14, a6, b8, y14 - FMADD y15, a7, b8, y15 - FMADD y16, a8, b8, y16 + FMADD y05, a5, b8, y05 + FMADD y06, a6, b8, y06 + FMADD y07, a7, b8, y07 + FMADD y08, a8, b8, y08 addi AO5, AO5, 16 * SIZE addi AO6, AO6, 16 * SIZE @@ -1180,21 +1172,21 @@ LL(14): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + 
FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) LFD b5, 5 * SIZE(BO) @@ -1219,21 +1211,21 @@ LL(14): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 5 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 5 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 5 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 5 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 5 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 5 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 5 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 5 * SIZE(AO8) FMADD y01, a1, b5, y01 @@ -1253,21 +1245,21 @@ LL(14): FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 7 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 7 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 7 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 7 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 7 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 7 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 7 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -1287,21 +1279,21 @@ LL(14): FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) 
- FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 addi AO1, AO1, 8 * SIZE - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 addi AO2, AO2, 8 * SIZE - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 addi AO3, AO3, 8 * SIZE - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 addi AO4, AO4, 8 * SIZE - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 addi AO5, AO5, 8 * SIZE - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 addi AO6, AO6, 8 * SIZE - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 addi AO7, AO7, 8 * SIZE - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 addi AO8, AO8, 8 * SIZE addi BO, BO, 8 * SIZE .align 4 @@ -1341,21 +1333,21 @@ LL(15): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -1376,21 +1368,21 @@ LL(15): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 addi AO1, AO1, 4 * SIZE - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 addi AO2, AO2, 4 * SIZE - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 addi AO3, AO3, 4 * SIZE - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 addi AO4, AO4, 4 * SIZE - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 addi AO5, AO5, 4 * SIZE - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 addi AO6, AO6, 4 * SIZE - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 addi AO7, AO7, 4 * SIZE - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 addi AO8, AO8, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 
@@ -1428,22 +1420,22 @@ LL(16): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 addi AO3, AO3, 2 * SIZE addi AO4, AO4, 2 * SIZE - FMADD y11, a3, b2, y11 - FMADD y12, a4, b2, y12 + FMADD y03, a3, b2, y03 + FMADD y04, a4, b2, y04 addi AO5, AO5, 2 * SIZE addi AO6, AO6, 2 * SIZE - FMADD y13, a5, b2, y13 - FMADD y14, a6, b2, y14 + FMADD y05, a5, b2, y05 + FMADD y06, a6, b2, y06 addi AO7, AO7, 2 * SIZE addi AO8, AO8, 2 * SIZE - FMADD y15, a7, b2, y15 - FMADD y16, a8, b2, y16 + FMADD y07, a7, b2, y07 + FMADD y08, a8, b2, y08 addi BO, BO, 2 * SIZE .align 4 @@ -1486,15 +1478,6 @@ LL(18): LFD a7, 7 * SIZE(CO) LFD a8, 8 * SIZE(CO) - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FADD y05, y13, y05 - FADD y06, y14, y06 - FADD y07, y15, y07 - FADD y08, y16, y08 - FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 FMADD a3, alpha, y03, a3 @@ -1530,15 +1513,6 @@ LL(19): LFDUX a7, CO, INCY LFDUX a8, CO, INCY - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FADD y05, y13, y05 - FADD y06, y14, y06 - FADD y07, y15, y07 - FADD y08, y16, y08 - FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 @@ -1580,10 +1554,6 @@ LL(20): fmr y02, y01 fmr y03, y01 fmr y04, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 DCBT(Y1, PREC) @@ -1621,13 +1591,13 @@ LL(22): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1639,13 +1609,13 @@ LL(22): FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 6 
* SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 6 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 6 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) @@ -1662,13 +1632,13 @@ LL(22): FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 8 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD y02, a6, b6, y02 LFD a6, 8 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 8 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1680,13 +1650,13 @@ LL(22): FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) - FMADD y09, a5, b8, y09 + FMADD y01, a5, b8, y01 LFD a5, 10 * SIZE(AO1) - FMADD y10, a6, b8, y10 + FMADD y02, a6, b8, y02 LFD a6, 10 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y03, a7, b8, y03 LFD a7, 10 * SIZE(AO3) - FMADD y12, a8, b8, y12 + FMADD y04, a8, b8, y04 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) @@ -1703,13 +1673,13 @@ LL(22): FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 12 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 12 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 12 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1721,13 +1691,13 @@ LL(22): FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 14 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 14 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 14 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 14 * SIZE(AO4) LFD b1, 17 * SIZE(BO) @@ -1744,13 +1714,13 @@ LL(22): FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 16 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD 
y02, a6, b6, y02 LFD a6, 16 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 16 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1762,13 +1732,13 @@ LL(22): FMADD y04, a4, b7, y04 LFD a4, 17 * SIZE(AO4) - FMADD y09, a5, b8, y09 + FMADD y01, a5, b8, y01 LFD a5, 18 * SIZE(AO1) - FMADD y10, a6, b8, y10 + FMADD y02, a6, b8, y02 LFD a6, 18 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y03, a7, b8, y03 LFD a7, 18 * SIZE(AO3) - FMADD y12, a8, b8, y12 + FMADD y04, a8, b8, y04 LFD a8, 18 * SIZE(AO4) LFD b5, 21 * SIZE(BO) @@ -1800,13 +1770,13 @@ LL(23): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1818,13 +1788,13 @@ LL(23): FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 6 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 6 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 6 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) @@ -1841,13 +1811,13 @@ LL(23): FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 8 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD y02, a6, b6, y02 LFD a6, 8 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 8 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1859,13 +1829,13 @@ LL(23): FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) - FMADD y09, a5, b8, y09 + FMADD y01, a5, b8, y01 LFD a5, 10 * SIZE(AO1) - FMADD y10, a6, b8, y10 + FMADD y02, a6, b8, y02 LFD a6, 10 * SIZE(AO2) - FMADD y11, 
a7, b8, y11 + FMADD y03, a7, b8, y03 LFD a7, 10 * SIZE(AO3) - FMADD y12, a8, b8, y12 + FMADD y04, a8, b8, y04 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) @@ -1882,13 +1852,13 @@ LL(23): FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 12 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 12 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 12 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1900,13 +1870,13 @@ LL(23): FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 14 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 14 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 14 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 14 * SIZE(AO4) FMADD y01, a1, b5, y01 @@ -1918,13 +1888,13 @@ LL(23): FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 16 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD y02, a6, b6, y02 LFD a6, 16 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 16 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1932,10 +1902,10 @@ LL(23): FMADD y03, a3, b7, y03 FMADD y04, a4, b7, y04 - FMADD y09, a5, b8, y09 - FMADD y10, a6, b8, y10 - FMADD y11, a7, b8, y11 - FMADD y12, a8, b8, y12 + FMADD y01, a5, b8, y01 + FMADD y02, a6, b8, y02 + FMADD y03, a7, b8, y03 + FMADD y04, a8, b8, y04 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE @@ -1975,13 +1945,13 @@ LL(24): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, 
b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1993,13 +1963,13 @@ LL(24): FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 6 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 6 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 6 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 6 * SIZE(AO4) LFD b1, 5 * SIZE(BO) @@ -2016,13 +1986,13 @@ LL(24): FMADD y04, a4, b1, y04 LFD a4, 7 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 8 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 8 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 8 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -2030,13 +2000,13 @@ LL(24): FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 addi AO1, AO1, 8 * SIZE - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 addi AO2, AO2, 8 * SIZE - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 addi AO3, AO3, 8 * SIZE - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 addi AO4, AO4, 8 * SIZE addi BO, BO, 8 * SIZE @@ -2070,13 +2040,13 @@ LL(25): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -2084,13 +2054,13 @@ LL(25): FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 addi AO1, AO1, 4 * SIZE - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 addi AO2, AO2, 4 * SIZE - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 addi AO3, AO3, 4 * SIZE - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 addi AO4, AO4, 4 * SIZE addi BO, BO, 4 
* SIZE .align 4 @@ -2117,13 +2087,13 @@ LL(26): FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 addi AO1, AO1, 2 * SIZE - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 addi AO2, AO2, 2 * SIZE - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 addi AO3, AO3, 2 * SIZE - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 addi AO4, AO4, 2 * SIZE addi BO, BO, 2 * SIZE .align 4 @@ -2156,11 +2126,6 @@ LL(28): LFD a3, 3 * SIZE(CO) LFD a4, 4 * SIZE(CO) - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 FMADD a3, alpha, y03, a3 @@ -2181,11 +2146,6 @@ LL(29): LFDUX a3, CO, INCY LFDUX a4, CO, INCY - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 @@ -2209,12 +2169,6 @@ LL(30): lfd y01, FZERO fmr y02, y01 - fmr y03, y01 - fmr y04, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 DCBT(Y1, PREC) @@ -2247,18 +2201,18 @@ LL(32): LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 6 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 6 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 7 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 7 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 8 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 8 * SIZE(AO2) LFD b1, 9 * SIZE(BO) @@ -2270,18 +2224,18 @@ LL(32): LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 9 * SIZE(AO2) - FMADD y03, a3, b6, y03 + FMADD y01, a3, b6, y01 LFD a3, 10 * SIZE(AO1) - FMADD y04, a4, b6, y04 + FMADD y02, a4, b6, y02 LFD a4, 10 * SIZE(AO2) - FMADD y09, a5, b7, y09 + FMADD y01, a5, b7, y01 LFD a5, 11 * SIZE(AO1) - FMADD y10, a6, b7, y10 + FMADD y02, a6, b7, y02 LFD a6, 11 * SIZE(AO2) - FMADD 
y11, a7, b8, y11 + FMADD y01, a7, b8, y01 LFD a7, 12 * SIZE(AO1) - FMADD y12, a8, b8, y12 + FMADD y02, a8, b8, y02 LFD a8, 12 * SIZE(AO2) LFD b5, 13 * SIZE(BO) @@ -2293,18 +2247,18 @@ LL(32): LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 13 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 14 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 14 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 15 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 15 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 16 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 16 * SIZE(AO2) LFD b1, 17 * SIZE(BO) @@ -2316,18 +2270,18 @@ LL(32): LFD a1, 17 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 17 * SIZE(AO2) - FMADD y03, a3, b6, y03 + FMADD y01, a3, b6, y01 LFD a3, 18 * SIZE(AO1) - FMADD y04, a4, b6, y04 + FMADD y02, a4, b6, y02 LFD a4, 18 * SIZE(AO2) - FMADD y09, a5, b7, y09 + FMADD y01, a5, b7, y01 LFD a5, 19 * SIZE(AO1) - FMADD y10, a6, b7, y10 + FMADD y02, a6, b7, y02 LFD a6, 19 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y01, a7, b8, y01 LFD a7, 20 * SIZE(AO1) - FMADD y12, a8, b8, y12 + FMADD y02, a8, b8, y02 LFD a8, 20 * SIZE(AO2) LFD b5, 21 * SIZE(BO) @@ -2349,18 +2303,18 @@ LL(33): LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 6 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 6 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 7 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 7 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 8 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 8 * SIZE(AO2) LFD b1, 9 * SIZE(BO) @@ -2372,18 +2326,18 @@ LL(33): LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 9 * SIZE(AO2) - FMADD y03, a3, b6, y03 + FMADD y01, a3, b6, y01 LFD a3, 10 * 
SIZE(AO1) - FMADD y04, a4, b6, y04 + FMADD y02, a4, b6, y02 LFD a4, 10 * SIZE(AO2) - FMADD y09, a5, b7, y09 + FMADD y01, a5, b7, y01 LFD a5, 11 * SIZE(AO1) - FMADD y10, a6, b7, y10 + FMADD y02, a6, b7, y02 LFD a6, 11 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y01, a7, b8, y01 LFD a7, 12 * SIZE(AO1) - FMADD y12, a8, b8, y12 + FMADD y02, a8, b8, y02 LFD a8, 12 * SIZE(AO2) LFD b5, 13 * SIZE(BO) @@ -2395,29 +2349,29 @@ LL(33): LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 13 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 14 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 14 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 15 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 15 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 16 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 16 * SIZE(AO2) FMADD y01, a1, b5, y01 FMADD y02, a2, b5, y02 - FMADD y03, a3, b6, y03 - FMADD y04, a4, b6, y04 + FMADD y01, a3, b6, y01 + FMADD y02, a4, b6, y02 - FMADD y09, a5, b7, y09 - FMADD y10, a6, b7, y10 - FMADD y11, a7, b8, y11 - FMADD y12, a8, b8, y12 + FMADD y01, a5, b7, y01 + FMADD y02, a6, b7, y02 + FMADD y01, a7, b8, y01 + FMADD y02, a8, b8, y02 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE @@ -2454,32 +2408,32 @@ LL(34): LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) - FMADD y09, a3, b2, y09 + FMADD y01, a3, b2, y01 LFD a3, 6 * SIZE(AO1) - FMADD y10, a4, b2, y10 + FMADD y02, a4, b2, y02 LFD a4, 6 * SIZE(AO2) FMADD y01, a5, b3, y01 LFD a5, 7 * SIZE(AO1) FMADD y02, a6, b3, y02 LFD a6, 7 * SIZE(AO2) - FMADD y09, a7, b4, y09 + FMADD y01, a7, b4, y01 LFD a7, 8 * SIZE(AO1) - FMADD y10, a8, b4, y10 + FMADD y02, a8, b4, y02 LFD a8, 8 * SIZE(AO2) FMADD y01, a1, b5, y01 FMADD y02, a2, b5, y02 - FMADD y09, a3, b6, y09 - FMADD y10, a4, b6, y10 + FMADD y01, a3, b6, y01 + FMADD y02, a4, b6, y02 FMADD y01, a5, b7, y01 addi AO1, AO1, 8 * 
SIZE FMADD y02, a6, b7, y02 addi AO2, AO2, 8 * SIZE - FMADD y09, a7, b8, y09 + FMADD y01, a7, b8, y01 addi BO, BO, 8 * SIZE - FMADD y10, a8, b8, y10 + FMADD y02, a8, b8, y02 nop .align 4 @@ -2504,17 +2458,17 @@ LL(35): FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 - FMADD y09, a3, b2, y09 - FMADD y10, a4, b2, y10 + FMADD y01, a3, b2, y01 + FMADD y02, a4, b2, y02 FMADD y01, a5, b3, y01 addi AO1, AO1, 4 * SIZE FMADD y02, a6, b3, y02 addi AO2, AO2, 4 * SIZE - FMADD y09, a7, b4, y09 + FMADD y01, a7, b4, y01 addi BO, BO, 4 * SIZE - FMADD y10, a8, b4, y10 + FMADD y02, a8, b4, y02 .align 4 LL(36): @@ -2531,8 +2485,8 @@ LL(36): FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 - FMADD y09, a3, b2, y09 - FMADD y10, a4, b2, y10 + FMADD y01, a3, b2, y01 + FMADD y02, a4, b2, y02 addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE @@ -2560,14 +2514,6 @@ LL(38): LFD a1, 1 * SIZE(CO) LFD a2, 2 * SIZE(CO) - FADD y01, y03, y01 - FADD y02, y04, y02 - FADD y09, y11, y09 - FADD y10, y12, y10 - - FADD y01, y09, y01 - FADD y02, y10, y02 - FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 @@ -2582,14 +2528,6 @@ LL(39): LFDUX a1, CO, INCY LFDUX a2, CO, INCY - FADD y01, y03, y01 - FADD y02, y04, y02 - FADD y09, y11, y09 - FADD y10, y12, y10 - - FADD y01, y09, y01 - FADD y02, y10, y02 - FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 @@ -2606,13 +2544,6 @@ LL(40): mr BO, XP lfd y01, FZERO - fmr y02, y01 - fmr y03, y01 - fmr y04, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 DCBT(Y1, PREC) @@ -2646,17 +2577,17 @@ LL(42): LFD a1, 9 * SIZE(AO1) LFD b1, 9 * SIZE(BO) - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 nop LFD a2, 10 * SIZE(AO1) LFD b2, 10 * SIZE(BO) - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 nop LFD a3, 11 * SIZE(AO1) LFD b3, 11 * SIZE(BO) - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop LFD a4, 12 * SIZE(AO1) LFD b4, 12 * SIZE(BO) @@ -2666,17 +2597,17 @@ LL(42): LFD a5, 13 * SIZE(AO1) LFD b5, 13 * SIZE(BO) - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 
nop LFD a6, 14 * SIZE(AO1) LFD b6, 14 * SIZE(BO) - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop LFD a7, 15 * SIZE(AO1) LFD b7, 15 * SIZE(BO) - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop LFD a8, 16 * SIZE(AO1) LFD b8, 16 * SIZE(BO) @@ -2686,17 +2617,17 @@ LL(42): LFD a1, 17 * SIZE(AO1) LFD b1, 17 * SIZE(BO) - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 nop LFD a2, 18 * SIZE(AO1) LFD b2, 18 * SIZE(BO) - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 nop LFD a3, 19 * SIZE(AO1) LFD b3, 19 * SIZE(BO) - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop LFD a4, 20 * SIZE(AO1) LFD b4, 20 * SIZE(BO) @@ -2706,17 +2637,17 @@ LL(42): LFD a5, 21 * SIZE(AO1) LFD b5, 21 * SIZE(BO) - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 nop LFD a6, 22 * SIZE(AO1) LFD b6, 22 * SIZE(BO) - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop LFD a7, 23 * SIZE(AO1) LFD b7, 23 * SIZE(BO) - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop LFD a8, 24 * SIZE(AO1) LFD b8, 24 * SIZE(BO) @@ -2733,17 +2664,17 @@ LL(43): LFD a1, 9 * SIZE(AO1) LFD b1, 9 * SIZE(BO) - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 nop LFD a2, 10 * SIZE(AO1) LFD b2, 10 * SIZE(BO) - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 nop LFD a3, 11 * SIZE(AO1) LFD b3, 11 * SIZE(BO) - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop LFD a4, 12 * SIZE(AO1) LFD b4, 12 * SIZE(BO) @@ -2753,34 +2684,34 @@ LL(43): LFD a5, 13 * SIZE(AO1) LFD b5, 13 * SIZE(BO) - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 nop LFD a6, 14 * SIZE(AO1) LFD b6, 14 * SIZE(BO) - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop LFD a7, 15 * SIZE(AO1) LFD b7, 15 * SIZE(BO) - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop LFD a8, 16 * SIZE(AO1) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 - FMADD y02, a2, b2, y02 - FMADD y03, a3, b3, y03 - FMADD y04, a4, b4, y04 + FMADD y01, a2, b2, y01 + FMADD y01, a3, b3, y01 + FMADD y01, a4, b4, y01 FMADD y01, a5, b5, y01 addi AO1, AO1, 16 * SIZE - FMADD y02, a6, b6, y02 + FMADD y01, 
a6, b6, y01 addi BO, BO, 16 * SIZE - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop .align 4 @@ -2811,17 +2742,17 @@ LL(44): LFD b8, 8 * SIZE(BO) FMADD y01, a1, b1, y01 - FMADD y02, a2, b2, y02 - FMADD y03, a3, b3, y03 - FMADD y04, a4, b4, y04 + FMADD y01, a2, b2, y01 + FMADD y01, a3, b3, y01 + FMADD y01, a4, b4, y01 FMADD y01, a5, b5, y01 addi AO1, AO1, 8 * SIZE - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 addi BO, BO, 8 * SIZE - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop .align 4 @@ -2841,12 +2772,12 @@ LL(45): FMADD y01, a1, b1, y01 addi AO1, AO1, 4 * SIZE - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 addi AO2, AO2, 4 * SIZE - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 addi BO, BO, 4 * SIZE - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop .align 4 @@ -2861,7 +2792,7 @@ LL(46): FMADD y01, a1, b1, y01 addi AO1, AO1, 2 * SIZE - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 addi BO, BO, 2 * SIZE .align 4 @@ -2882,10 +2813,6 @@ LL(48): LFD a1, 1 * SIZE(CO) - FADD y01, y02, y01 - FADD y03, y04, y03 - FADD y01, y03, y01 - FMADD a1, alpha, y01, a1 STFD a1, 1 * SIZE(CO) b LL(99) @@ -2893,9 +2820,7 @@ LL(48): LL(49): LFDUX a1, CO, INCY - FADD y01, y02, y01 - FADD y03, y04, y03 - FADD y01, y03, y01 + FMADD a1, alpha, f0, a1 STFDUX a1, BO, INCY .align 4 diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 302b2418..a01e1b53 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -76,6 +76,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS saxpy_kernel_64(n1, &x[i], &y[i], da); i += n1; +#if defined(__clang__) +#pragma clang loop interleave_count(2) +#endif while(i < n) { diff --git a/kernel/power/sbgemm_ncopy_16_power10.c b/kernel/power/sbgemm_ncopy_16_power10.c index c6b63301..595edfda 100644 --- a/kernel/power/sbgemm_ncopy_16_power10.c +++ 
b/kernel/power/sbgemm_ncopy_16_power10.c @@ -37,8 +37,11 @@ /*********************************************************************/ #include +#include #include "common.h" +typedef uint32_t vec_bf16x2 __attribute__ ((vector_size (16))); + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; @@ -82,7 +85,84 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset16 = aoffset15 + lda; aoffset += 16 * lda; - i = (m >> 1); + i = (m >> 3); + if (i > 0) { + do { + vec_bf16x2 vtemp01 = *(vec_bf16x2 *)(aoffset1); + vec_bf16x2 vtemp02 = *(vec_bf16x2 *)(aoffset2); + vec_bf16x2 vtemp03 = *(vec_bf16x2 *)(aoffset3); + vec_bf16x2 vtemp04 = *(vec_bf16x2 *)(aoffset4); + vec_bf16x2 vtemp05 = *(vec_bf16x2 *)(aoffset5); + vec_bf16x2 vtemp06 = *(vec_bf16x2 *)(aoffset6); + vec_bf16x2 vtemp07 = *(vec_bf16x2 *)(aoffset7); + vec_bf16x2 vtemp08 = *(vec_bf16x2 *)(aoffset8); + vec_bf16x2 vtemp09 = *(vec_bf16x2 *)(aoffset9); + vec_bf16x2 vtemp10 = *(vec_bf16x2 *)(aoffset10); + vec_bf16x2 vtemp11 = *(vec_bf16x2 *)(aoffset11); + vec_bf16x2 vtemp12 = *(vec_bf16x2 *)(aoffset12); + vec_bf16x2 vtemp13 = *(vec_bf16x2 *)(aoffset13); + vec_bf16x2 vtemp14 = *(vec_bf16x2 *)(aoffset14); + vec_bf16x2 vtemp15 = *(vec_bf16x2 *)(aoffset15); + vec_bf16x2 vtemp16 = *(vec_bf16x2 *)(aoffset16); + + vec_bf16x2 vtemp17 = vec_mergeh(vtemp01, vtemp03); + vec_bf16x2 vtemp18 = vec_mergel(vtemp01, vtemp03); + vec_bf16x2 vtemp19 = vec_mergeh(vtemp02, vtemp04); + vec_bf16x2 vtemp20 = vec_mergel(vtemp02, vtemp04); + vec_bf16x2 vtemp21 = vec_mergeh(vtemp05, vtemp07); + vec_bf16x2 vtemp22 = vec_mergel(vtemp05, vtemp07); + vec_bf16x2 vtemp23 = vec_mergeh(vtemp06, vtemp08); + vec_bf16x2 vtemp24 = vec_mergel(vtemp06, vtemp08); + vec_bf16x2 vtemp25 = vec_mergeh(vtemp09, vtemp11); + vec_bf16x2 vtemp26 = vec_mergel(vtemp09, vtemp11); + vec_bf16x2 vtemp27 = vec_mergeh(vtemp10, vtemp12); + vec_bf16x2 vtemp28 = vec_mergel(vtemp10, vtemp12); + vec_bf16x2 vtemp29 = 
vec_mergeh(vtemp13, vtemp15); + vec_bf16x2 vtemp30 = vec_mergel(vtemp13, vtemp15); + vec_bf16x2 vtemp31 = vec_mergeh(vtemp14, vtemp16); + vec_bf16x2 vtemp32 = vec_mergel(vtemp14, vtemp16); + + *(vec_bf16x2 *)(boffset + 0) = vec_mergeh(vtemp17, vtemp19); + *(vec_bf16x2 *)(boffset + 8) = vec_mergeh(vtemp21, vtemp23); + *(vec_bf16x2 *)(boffset + 16) = vec_mergeh(vtemp25, vtemp27); + *(vec_bf16x2 *)(boffset + 24) = vec_mergeh(vtemp29, vtemp31); + *(vec_bf16x2 *)(boffset + 32) = vec_mergel(vtemp17, vtemp19); + *(vec_bf16x2 *)(boffset + 40) = vec_mergel(vtemp21, vtemp23); + *(vec_bf16x2 *)(boffset + 48) = vec_mergel(vtemp25, vtemp27); + *(vec_bf16x2 *)(boffset + 56) = vec_mergel(vtemp29, vtemp31); + *(vec_bf16x2 *)(boffset + 64) = vec_mergeh(vtemp18, vtemp20); + *(vec_bf16x2 *)(boffset + 72) = vec_mergeh(vtemp22, vtemp24); + *(vec_bf16x2 *)(boffset + 80) = vec_mergeh(vtemp26, vtemp28); + *(vec_bf16x2 *)(boffset + 88) = vec_mergeh(vtemp30, vtemp32); + *(vec_bf16x2 *)(boffset + 96) = vec_mergel(vtemp18, vtemp20); + *(vec_bf16x2 *)(boffset + 104) = vec_mergel(vtemp22, vtemp24); + *(vec_bf16x2 *)(boffset + 112) = vec_mergel(vtemp26, vtemp28); + *(vec_bf16x2 *)(boffset + 120) = vec_mergel(vtemp30, vtemp32); + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + aoffset9 += 8; + aoffset10 += 8; + aoffset11 += 8; + aoffset12 += 8; + aoffset13 += 8; + aoffset14 += 8; + aoffset15 += 8; + aoffset16 += 8; + + boffset += 128; + + i--; + } while (i > 0); + } + + i = (m & 7) >> 1; if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); diff --git a/kernel/power/scal.S b/kernel/power/scal.S index 19fdd32a..5e92a88a 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -47,9 +47,11 @@ #ifndef __64BIT__ #define X r6 #define INCX r7 +#define FLAG r11 #else #define X r7 #define INCX r8 +#define FLAG r12 #endif #endif @@ -57,9 +59,11 @@ #if !defined(__64BIT__) && defined(DOUBLE) #define X r8 #define INCX r9 
+#define FLAG r13 #else #define X r7 #define INCX r8 +#define FLAG r12 #endif #endif @@ -87,6 +91,10 @@ fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) + ld FLAG, 48+64+8(SP) + cmpwi cr0, FLAG, 1 + beq- cr0, LL(A1I1) + cmpwi cr0, INCX, SIZE bne- cr0, LL(A0IN) diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 65572a8c..54047a85 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -74,16 +74,58 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x ) for( i=0; i= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + if (dummy2 == 0) + for (j = 0; j < align; j++){ + x[j] = 0.0; + } + else for (j = 0; j < align; j++) { - x[j] = 0.0; + if (isfinite(x[j])) + x[j] = 0.0; + else + x[j] = NAN; } } BLASLONG n1 = (n-j) & -32; @@ -126,11 +176,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS j=n1; } #endif - + if (dummy2 == 0) while(j < n) { - - x[j]=0.0; + x[j] = 0.0; + j++; + } + else + while(j < n) + { + if (isfinite(x[j])) + x[j]=0.0; + else + x[j]=NAN; j++; } @@ -175,11 +233,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - + if (dummy2 == 0) while(j < n) { - x[i]=0.0; + i += inc_x; + j++; + } + else + while(j < n) + { + if (isfinite(x[i])) + x[i]=0.0; + else + x[i]=NAN; i += inc_x ; j++; } diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 9b47b9fc..c513285d 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -1159,9 +1159,9 @@ LL(20): LL(22): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) @@ -1169,9 +1169,9 @@ LL(22): LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f20, 8 * SIZE(BO) LFD f21, 9 * 
SIZE(BO) @@ -1179,14 +1179,14 @@ LL(22): LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 - FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 - FMA3 f10, f17, f25, f10 + FMA4 f9, f17, f24, f9 + FMA3 f8, f17, f25, f8 FMA1 f12, f16, f26, f12 - FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 - FMA3 f14, f17, f27, f14 + FMA4 f13, f17, f26, f13 + FMA3 f12, f17, f27, f12 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) @@ -1194,14 +1194,14 @@ LL(22): LFD f27, 15 * SIZE(BO) FMA1 f0, f18, f20, f0 - FMA4 f3, f19, f20, f3 FMA2 f1, f18, f21, f1 - FMA3 f2, f19, f21, f2 + FMA4 f1, f19, f20, f1 + FMA3 f0, f19, f21, f0 FMA1 f4, f18, f22, f4 - FMA4 f7, f19, f22, f7 FMA2 f5, f18, f23, f5 - FMA3 f6, f19, f23, f6 + FMA4 f5, f19, f22, f5 + FMA3 f4, f19, f23, f4 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) @@ -1209,14 +1209,14 @@ LL(22): LFD f23, 19 * SIZE(BO) FMA1 f8, f18, f24, f8 - FMA4 f11, f19, f24, f11 FMA2 f9, f18, f25, f9 - FMA3 f10, f19, f25, f10 + FMA4 f9, f19, f24, f9 + FMA3 f8, f19, f25, f8 FMA1 f12, f18, f26, f12 - FMA4 f15, f19, f26, f15 FMA2 f13, f18, f27, f13 - FMA3 f14, f19, f27, f14 + FMA4 f13, f19, f26, f13 + FMA3 f12, f19, f27, f12 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) @@ -1224,9 +1224,9 @@ LL(22): LFD f27, 23 * SIZE(BO) FMA1 f0, f28, f20, f0 - FMA4 f3, f29, f20, f3 FMA2 f1, f28, f21, f1 - FMA3 f2, f29, f21, f2 + FMA4 f1, f29, f20, f1 + FMA3 f0, f29, f21, f0 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -1234,9 +1234,9 @@ LL(22): LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 - FMA4 f7, f29, f22, f7 FMA2 f5, f28, f23, f5 - FMA3 f6, f29, f23, f6 + FMA4 f5, f29, f22, f5 + FMA3 f4, f29, f23, f4 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) @@ -1244,14 +1244,14 @@ LL(22): LFD f23, 27 * SIZE(BO) FMA1 f8, f28, f24, f8 - FMA4 f11, f29, f24, f11 FMA2 f9, f28, f25, f9 - FMA3 f10, f29, f25, f10 + FMA4 f9, f29, f24, f9 + FMA3 f8, f29, f25, f8 FMA1 f12, f28, f26, f12 - FMA4 f15, f29, f26, f15 FMA2 f13, f28, f27, f13 - FMA3 f14, f29, f27, f14 + FMA4 f13, f29, f26, f13 + FMA3 f12, 
f29, f27, f12 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) @@ -1259,14 +1259,14 @@ LL(22): LFD f27, 31 * SIZE(BO) FMA1 f0, f30, f20, f0 - FMA4 f3, f31, f20, f3 FMA2 f1, f30, f21, f1 - FMA3 f2, f31, f21, f2 + FMA4 f1, f31, f20, f1 + FMA3 f0, f31, f21, f0 FMA1 f4, f30, f22, f4 - FMA4 f7, f31, f22, f7 FMA2 f5, f30, f23, f5 - FMA3 f6, f31, f23, f6 + FMA4 f5, f31, f22, f5 + FMA3 f4, f31, f23, f4 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) @@ -1274,14 +1274,14 @@ LL(22): LFD f23, 35 * SIZE(BO) FMA1 f8, f30, f24, f8 - FMA4 f11, f31, f24, f11 FMA2 f9, f30, f25, f9 - FMA3 f10, f31, f25, f10 + FMA4 f9, f31, f24, f9 + FMA3 f8, f31, f25, f8 FMA1 f12, f30, f26, f12 - FMA4 f15, f31, f26, f15 FMA2 f13, f30, f27, f13 - FMA3 f14, f31, f27, f14 + FMA4 f13, f31, f26, f13 + FMA3 f12, f31, f27, f12 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) @@ -1318,14 +1318,14 @@ LL(25): LL(26): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) @@ -1333,14 +1333,14 @@ LL(26): LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 - FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 - FMA3 f10, f17, f25, f10 + FMA4 f9, f17, f24, f9 + FMA3 f8, f17, f25, f8 FMA1 f12, f16, f26, f12 - FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 - FMA3 f14, f17, f27, f14 + FMA4 f13, f17, f26, f13 + FMA3 f12, f17, f27, f12 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) @@ -1363,47 +1363,42 @@ LL(28): LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - FADD f5, f5, f7 - LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) - FADD f8, f8, f10 - FADD f9, f9, f11 - FADD f12, f12, f14 - FADD f13, f13, f15 + fmr f2, f0 + fmr f3, f1 + fmr f6, f4 + fmr f7, f5 - FNMSUB f24, f31, f1, f16 
- FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f5, f18 - FMADD f27, f31, f4, f19 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f4, f18 + FMADD f27, f30, f5, f19 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f4, f30, f4, f26 - FMADD f5, f30, f5, f27 + FNMSUB f0, f31, f3, f24 + FMADD f1, f31, f2, f25 + FNMSUB f4, f31, f7, f26 + FMADD f5, f31, f6, f27 - FNMSUB f24, f31, f9, f20 - FMADD f25, f31, f8, f21 - FNMSUB f26, f31, f13, f22 - FMADD f27, f31, f12, f23 + fmr f10, f8 + fmr f11, f9 + fmr f14, f12 + fmr f15, f13 - FMADD f8, f30, f8, f24 - FMADD f9, f30, f9, f25 - FMADD f12, f30, f12, f26 - FMADD f13, f30, f13, f27 + FMADD f24, f30, f8, f20 + FMADD f25, f30, f9, f21 + FMADD f26, f30, f12, f22 + FMADD f27, f30, f13, f23 -#else - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - FADD f5, f5, f7 + FNMSUB f8, f31, f11, f24 + FMADD f9, f31, f10, f25 + FNMSUB f12, f31, f15, f26 + FMADD f13, f31, f14, f27 +#else FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f5 @@ -1414,11 +1409,6 @@ LL(28): FMSUB f4, f30, f4, f18 FMADD f5, f30, f5, f19 - FADD f8, f8, f10 - FADD f9, f9, f11 - FADD f12, f12, f14 - FADD f13, f13, f15 - FMUL f20, f31, f9 FMUL f21, f31, f8 FMUL f22, f31, f13 @@ -1616,15 +1606,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) @@ -1646,15 +1636,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, 
f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) @@ -1676,15 +1666,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) @@ -1706,15 +1696,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) @@ -1736,15 +1726,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) @@ -1766,15 +1756,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, 
f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) @@ -1796,15 +1786,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) @@ -1826,15 +1816,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) @@ -1883,20 +1873,20 @@ LL(36): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 4 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f17, 5 * SIZE(AO) LFD f19, 7 * SIZE(AO) @@ -1916,52 +1906,42 @@ 
LL(38): LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 - LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) - FADD f4, f4, f12 - FADD f5, f5, f13 - FADD f6, f6, f14 - FADD f7, f7, f15 + fmr f8, f0 + fmr f9, f1 + fmr f10, f2 + fmr f11, f3 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f3, f18 - FMADD f27, f31, f2, f19 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f2, f18 + FMADD f27, f30, f3, f19 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f2, f30, f2, f26 - FMADD f3, f30, f3, f27 + FNMSUB f0, f31, f9, f24 + FMADD f1, f31, f8, f25 + FNMSUB f2, f31, f11, f26 + FMADD f3, f31, f10, f27 - FNMSUB f24, f31, f5, f20 - FMADD f25, f31, f4, f21 - FNMSUB f26, f31, f7, f22 - FMADD f27, f31, f6, f23 + fmr f12, f4 + fmr f13, f5 + fmr f14, f6 + fmr f15, f7 - FMADD f4, f30, f4, f24 - FMADD f5, f30, f5, f25 - FMADD f6, f30, f6, f26 - FMADD f7, f30, f7, f27 + FMADD f24, f30, f4, f20 + FMADD f25, f30, f5, f21 + FMADD f26, f30, f6, f22 + FMADD f27, f30, f7, f23 -#else - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 - - FADD f4, f4, f12 - FADD f5, f5, f13 - FADD f6, f6, f14 - FADD f7, f7, f15 + FNMSUB f4, f31, f13, f24 + FMADD f5, f31, f12, f25 + FNMSUB f6, f31, f15, f26 + FMADD f7, f31, f14, f27 +#else FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 @@ -2101,14 +2081,14 @@ LL(40): LL(42): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) @@ -2119,14 +2099,14 @@ LL(42): LFD f23, 7 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 
f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2137,14 +2117,14 @@ LL(42): LFD f23, 11 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) @@ -2155,14 +2135,14 @@ LL(42): LFD f23, 15 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2202,14 +2182,14 @@ LL(45): LL(46): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) @@ -2231,27 +2211,22 @@ LL(48): LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - FADD f5, f5, f7 + fmr f2, f0 + fmr f3, f1 + fmr f6, f4 + fmr f7, f5 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f5, f20 - FMADD f27, f31, f4, f21 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f4, f20 + FMADD f27, f30, f5, f21 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f4, f30, f4, f26 - FMADD f5, f30, f5, f27 + FNMSUB f0, f31, f3, f24 + FMADD f1, f31, f2, f25 + FNMSUB f4, f31, f7, f26 + FMADD f5, f31, f6, f27 #else - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - 
FADD f5, f5, f7 - FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f5 @@ -2401,10 +2376,10 @@ LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2416,10 +2391,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2436,10 +2411,10 @@ LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) @@ -2451,10 +2426,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) @@ -2471,10 +2446,10 @@ LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 20 * SIZE(AO) LFD f17, 21 * SIZE(AO) @@ -2486,10 +2461,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) @@ -2506,10 +2481,10 @@ 
LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 28 * SIZE(AO) LFD f17, 29 * SIZE(AO) @@ -2521,10 +2496,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) @@ -2573,10 +2548,10 @@ LL(56): LFD f16, 4 * SIZE(AO) LFD f18, 6 * SIZE(AO) - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f17, 5 * SIZE(AO) LFD f19, 7 * SIZE(AO) @@ -2595,27 +2570,22 @@ LL(58): LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 + fmr f8, f0 + fmr f9, f1 + fmr f10, f2 + fmr f11, f3 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f3, f18 - FMADD f27, f31, f2, f19 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f2, f18 + FMADD f27, f30, f3, f19 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f2, f30, f2, f26 - FMADD f3, f30, f3, f27 + FNMSUB f0, f31, f9, f24 + FMADD f1, f31, f8, f25 + FNMSUB f2, f31, f11, f26 + FMADD f3, f31, f10, f27 #else - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 - FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 @@ -2735,9 +2705,9 @@ LL(60): LL(62): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2745,9 +2715,9 @@ LL(62): LFD f21, 5 * SIZE(BO) FMA1 f0, f18, f22, f0 - FMA4 f3, f19, 
f22, f3 FMA2 f1, f18, f23, f1 - FMA3 f2, f19, f23, f2 + FMA4 f1, f19, f22, f1 + FMA3 f0, f19, f23, f0 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) @@ -2755,9 +2725,9 @@ LL(62): LFD f23, 7 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2765,9 +2735,9 @@ LL(62): LFD f21, 9 * SIZE(BO) FMA1 f0, f18, f22, f0 - FMA4 f3, f19, f22, f3 FMA2 f1, f18, f23, f1 - FMA3 f2, f19, f23, f2 + FMA4 f1, f19, f22, f1 + FMA3 f0, f19, f23, f0 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) @@ -2803,11 +2773,11 @@ LL(65): LL(66): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 - LFD f20, 2 * SIZE(BO) FMA2 f1, f16, f21, f1 LFD f16, 2 * SIZE(AO) - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + LFD f20, 2 * SIZE(BO) + FMA3 f0, f17, f21, f0 LFD f17, 3 * SIZE(AO) LFD f21, 3 * SIZE(BO) @@ -2821,20 +2791,17 @@ LL(68): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) - FADD f0, f0, f2 - FADD f1, f1, f3 + fmr f2, f0 + fmr f3, f1 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 + FNMSUB f0, f31, f3, f24 + FMADD f1, f31, f2, f25 #else - FADD f0, f0, f2 - FADD f1, f1, f3 - FMUL f16, f31, f1 FMUL f17, f31, f0 diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 0068138e..6b7392d0 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,6 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 15bcd228..67f81cac 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -99,26 +99,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index d7fb86ea..27abc0ff 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -114,6 +114,11 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * vy = 
VFMULVF_FLOAT(vy, beta, vl); VSEV_FLOAT (y, vy, vl); } + } else if (inc_y == 0) { + FLOAT vf = y[0]; + for (; n > 0; n--) + vf *= beta; + y[0] = vf; } else { BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { @@ -134,6 +139,13 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); VSEV_FLOAT (y, vy, vl); } + } else if (inc_y == 0) { + FLOAT vf = y[0]; + for (; n > 0; n--) { + vf = (vf * beta) + (x[0] * alpha); + x += inc_x; + } + y[0] = vf; } else if (1 == inc_x) { BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { diff --git a/kernel/riscv64/scal.c b/kernel/riscv64/scal.c index 4ef49e29..bd53fcff 100644 --- a/kernel/riscv64/scal.c +++ b/kernel/riscv64/scal.c @@ -43,9 +43,24 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (n <= 0) || (inc_x <= 0)) return(0); + if (dummy2 == 1) { + while(j < n) + { - while(j < n) - { + if ( da == 0.0 ) + if (isfinite(x[i])) + x[i]=0.0; + else + x[i]=NAN; + else + x[i] = da * x[i] ; + + i += inc_x ; + j++; + } + } else { + while(j < n) + { if ( da == 0.0 ) x[i]=0.0; @@ -54,7 +69,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS i += inc_x ; j++; - + } } return 0; diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index 2c273fb6..827ab120 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T v0; if(inc_x == 1) { - if(da == 0.0) { + if(dummy2 == 0 && da == 0.0) { int gvl = VSETVL_MAX; v0 = VFMVVF_FLOAT(0.0, gvl); for (size_t vl; n > 0; n -= vl, x += vl) { @@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); - if(da == 0.0) { + if(dummy2 == 0 && da == 0.0) { int gvl = 
VSETVL_MAX; v0 = VFMVVF_FLOAT(0.0, gvl); for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 8fa9315f..4792b514 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T v0, v1; unsigned int gvl = 0; if(inc_x == 1){ - if(da == 0.0){ + if(dummy2 == 0 && da == 0.0){ memset(&x[0], 0, n * sizeof(FLOAT)); }else{ gvl = VSETVL(n); @@ -96,7 +96,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } } }else{ - if(da == 0.0){ + if(dummy2 == 0 && da == 0.0){ BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG ix = 0; gvl = VSETVL(n); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index 66e38c1e..9bf5bdd5 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -79,8 +79,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG stride_x = inc_x2 * sizeof(FLOAT); BLASLONG stride_y = inc_y2 * sizeof(FLOAT); + BLASLONG ix; FLOAT_V_T vx0, vx1, vy0, vy1; FLOAT_VX2_T vxx2, vyx2; + FLOAT temp; if ( beta_r == 0.0 && beta_i == 0.0) { @@ -125,53 +127,74 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL if ( alpha_r == 0.0 && alpha_i == 0.0 ) { - for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) - { - vl = VSETVL(n); - - vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - vy0 = VGET_VX2(vyx2, 0); - vy1 = VGET_VX2(vyx2, 1); - - v0 = VFMULVF_FLOAT(vy1, beta_i, vl); - v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); - - v1 = VFMULVF_FLOAT(vy1, beta_r, vl); - v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - - v_x2 = VSET_VX2(v_x2, 0, v0); - v_x2 = VSET_VX2(v_x2, 1, v1); - VSSSEG_FLOAT(y, stride_y, v_x2, vl); + if ( inc_y == 0 ) { + for (; n > 0; n--) + { + temp = (beta_r * y[0] - beta_i * y[1]); + y[1] = (beta_r * y[1] + beta_i * y[0]); + y[0] = temp; + } + } else { + 
for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vy1, beta_i, vl); + v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); + + v1 = VFMULVF_FLOAT(vy1, beta_r, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } } } else { - for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) - { - vl = VSETVL(n); - - vxx2 = VLSSEG_FLOAT(x, stride_x, vl); - vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - - vx0 = VGET_VX2(vxx2, 0); - vx1 = VGET_VX2(vxx2, 1); - vy0 = VGET_VX2(vyx2, 0); - vy1 = VGET_VX2(vyx2, 1); - - v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); - v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); - v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); - v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); - - v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); - v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); - v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); - v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - - v_x2 = VSET_VX2(v_x2, 0, v0); - v_x2 = VSET_VX2(v_x2, 1, v1); - - VSSSEG_FLOAT(y, stride_y, v_x2, vl); + if ( inc_y == 0 ) { + ix = 0; + for (; n > 0; n--) { + temp = (alpha_r * x[ix] - alpha_i * x[ix+1] ) + + (beta_r * y[0] - beta_i * y[1]); + y[1] = (alpha_r * x[ix+1] + alpha_i * x[ix]) + + (beta_r * y[1] + beta_i * y[0]); + y[0] = temp; + ix += inc_x2; + } + } else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); + v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); + v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, 
vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } } } } diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c index b2d537d0..8499145f 100644 --- a/kernel/riscv64/zscal.c +++ b/kernel/riscv64/zscal.c @@ -61,7 +61,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F { temp = - da_i * x[ip+1] ; if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; - x[ip+1] = da_i * x[ip] ; + if (!isinf(x[ip+1])) + x[ip+1] = da_i * x[ip] ; + else x[ip+1] = NAN; } } else diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 4c361f15..9d494bfc 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1066,31 +1066,123 @@ static void init_parameter(void) { } #else // (ARCH_MIPS64) #if (ARCH_LOONGARCH64) +static int get_L3_size() { + int ret = 0, id = 0x14; + __asm__ volatile ( + "cpucfg %[ret], %[id]" + : [ret]"=r"(ret) + : [id]"r"(id) + : "memory" + ); + return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB +} static void init_parameter(void) { #ifdef BUILD_BFLOAT16 TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif + +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; +#endif + +#if defined(LOONGSON3R5) + int L3_size = get_L3_size(); +#ifdef SMP + if(blas_num_threads == 1){ +#endif + //single thread + if (L3_size == 32){ // 3C5000 and 3D5000 + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 8192; + + TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 289; + TABLE_NAME.dgemm_r = 4096; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 256; + TABLE_NAME.cgemm_r = 4096; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 2048; + } else { // 3A5000 and 3C5000L + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 4096; + + 
TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 300; + TABLE_NAME.dgemm_r = 3024; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 256; + TABLE_NAME.cgemm_r = 2048; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 1024; + } +#ifdef SMP + }else{ + //multi thread + if (L3_size == 32){ // 3C5000 and 3D5000 + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 1024; + + TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 289; + TABLE_NAME.dgemm_r = 342; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 256; + TABLE_NAME.cgemm_r = 512; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 512; + } else { // 3A5000 and 3C5000L + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 2048; + + TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 300; + TABLE_NAME.dgemm_r = 738; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 256; + TABLE_NAME.cgemm_r = 1024; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 1024; + } + } +#endif +#else TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_BFLOAT16 - TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; -#endif + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#endif #ifdef BUILD_BFLOAT16 TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif - TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; - TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; - TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; - TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; } #else // (ARCH_LOONGARCH64) #if (ARCH_POWER) @@ -1152,6 +1244,36 @@ static void init_parameter(void) { } #else //ZARCH +#if 
(ARCH_RISCV64) +static void init_parameter(void) { + +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; +#endif + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; +#endif + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + + +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; +#endif + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +} +#else //RISCV64 + #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1248,6 +1370,10 @@ static __inline__ int get_l2_size(void){ int eax, ebx, ecx, edx, l2; + l2 = readenv_atoi("OPENBLAS_L2_SIZE"); + if (l2 != 0) + return l2; + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); l2 = BITMASK(ecx, 16, 0xffff); @@ -1950,6 +2076,7 @@ static void init_parameter(void) { } +#endif //RISCV64 #endif //POWER #endif //ZARCH #endif //(ARCH_LOONGARCH64) diff --git a/kernel/sparc/scal.S b/kernel/sparc/scal.S index 36d9ce2a..fd61c820 100644 --- a/kernel/sparc/scal.S +++ b/kernel/sparc/scal.S @@ -120,8 +120,10 @@ FCLR(29) - FCMP ALPHA, FZERO - fbne .LL100 +// FCMP ALPHA, FZERO +// fbne .LL100 + b .LL100 + sll INCX, BASE_SHIFT, INCX cmp INCX, SIZE diff --git a/kernel/x86/scal.S b/kernel/x86/scal.S index 377d4ef6..6620d316 100644 --- a/kernel/x86/scal.S +++ b/kernel/x86/scal.S @@ -57,12 +57,15 @@ #ifdef XDOUBLE movl 44(%esp),%edi movl 48(%esp),%esi + movl 64(%esp),%ecx #elif defined(DOUBLE) movl 36(%esp),%edi movl 40(%esp),%esi + movl 56(%esp),%ecx #else movl 32(%esp),%edi movl 36(%esp),%esi + movl 52(%esp),%ecx #endif ftst @@ -70,6 +73,9 @@ andb $68, %ah je .L300 # Alpha != ZERO + cmpl $1,%ecx # 
dummy2 flag + je .L300 + /* Alpha == ZERO */ cmpl $1,%esi jne .L104 diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 95a99b8b..212a2159 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -259,11 +259,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while(j < n1) { + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else temp0 = -da_i * x[i+1]; + if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; + else + x[i+1] = NAN; x[i] = temp0; + if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) + temp1 = NAN; + else temp1 = -da_i * x[i+1+inc_x]; - x[i+1+inc_x] = da_i * x[i+inc_x]; + if (!isinf(x[i+1+inc_x])) + x[i+1+inc_x] = da_i * x[i+inc_x]; + else x[i+1+inc_x] = NAN; x[i+inc_x] = temp1; i += 2*inc_x ; j+=2; @@ -272,9 +283,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while(j < n) { - - temp0 = -da_i * x[i+1]; + + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i+1]; + if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; + else x[i+1] = NAN; x[i] = temp0; i += inc_x ; j++; @@ -365,42 +381,51 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else cscal_kernel_16_zero_r(n1 , alpha , x); else - if ( da_i == 0 ) - cscal_kernel_16_zero_i(n1 , alpha , x); - else cscal_kernel_16(n1 , alpha , x); i = n1 << 1; j = n1; } - - if ( da_r == 0.0 ) + if ( da_r == 0.0 || isnan(da_r) ) { - if ( da_i == 0.0 ) { - + FLOAT res=0.0; + if (isnan(da_r)) res= da_r; while(j < n) { - - x[i]=0.0; - x[i+1]=0.0; + x[i]=res; + x[i+1]=res; i += 2 ; j++; } } - else + else if (isinf(da_r)) { + while(j < n) + { + x[i]= NAN; + x[i+1] = da_r; + i += 2 ; + j++; + + } + + } else { while(j < n) { - temp0 = -da_i * x[i+1]; - x[i+1] = da_i * x[i]; - x[i] = temp0; + if (isinf(x[i])) + temp0 = NAN; + if (!isinf(x[i+1])) + x[i+1] = da_i * x[i]; + else x[i+1] = NAN; + if ( x[i] == x[i]) //preserve NaN + x[i] = temp0; i += 2 ; j++; diff --git a/kernel/x86_64/dscal.c 
b/kernel/x86_64/dscal.c index 05c5c7f1..e039d901 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -43,21 +43,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) { - BLASLONG i; - FLOAT alpha = *da; - - for( i=0; i 0 ) - { - dscal_kernel_inc_8(n1, &da, x, inc_x); - i = n1 * inc_x; - j = n1; - } - - while(j < n) - { - - x[i] *= da; - i += inc_x ; - j++; - - } - - } - - return(0); - } - - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - if ( da == 0.0 ) - dscal_kernel_8_zero(n1 , &da , x); - else - dscal_kernel_8(n1 , &da , x); - } - - if ( da == 0.0 ) - { - for ( i=n1 ; i 0 ) + { + dscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + while(j < n) + { + x[i] *= da; + i += inc_x ; + j++; + } + } + else + { + BLASLONG n1 = n & -8; + if ( n1 > 0) + dscal_kernel_8(n1 , &da , x); + for ( i = n1 ; i < n; i++ ) + x[i] *= da; + } + } + else + { + if ( inc_x != 1 ) + { + if( da == 0.0) + { + BLASLONG n1 = n & -2; + while(j < n1) + { + x[i] = 0.0; + x[i+inc_x] = 0.0; + i += 2 * inc_x ; + j += 2; + } + while(j < n) + { + x[i] = 0.0; + i += inc_x ; + j++; + } + } + else + { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + dscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + while(j < n) + { + x[i] *= da; + i += inc_x ; + j++; + } + } + } + else + { + if ( da == 0.0 ) + { + BLASLONG n1 = n & -8; + if ( n1 > 0) + dscal_kernel_8_zero(n1, &da, x); + for ( i = n1 ; i < n; i++ ) + x[i] = 0.0; + } + else + { + BLASLONG n1 = n & -8; + if ( n1 > 0) + dscal_kernel_8(n1 , &da , x); + for ( i = n1 ; i < n; i++ ) + x[i] *= da; + } + } + } + return(0); } - - diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S index 7192cecc..d30e9d36 100644 --- a/kernel/x86_64/gemm_ncopy_4.S +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -189,12 +189,16 @@ movss %xmm6, 6 * SIZE(B) movss %xmm7, 7 * SIZE(B) +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO1) 
PREFETCH RPREFETCHSIZE * SIZE(AO2) PREFETCH RPREFETCHSIZE * SIZE(AO3) PREFETCH RPREFETCHSIZE * SIZE(AO4) +#endif +#ifdef PREFETCHW PREFETCHW WPREFETCHSIZE * SIZE(B) +#endif movss %xmm8, 8 * SIZE(B) movss %xmm9, 9 * SIZE(B) @@ -205,29 +209,39 @@ movss %xmm14, 14 * SIZE(B) movss %xmm15, 15 * SIZE(B) #else +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO1) +#endif movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movsd 1 * SIZE(AO1), %xmm2 movhpd 1 * SIZE(AO2), %xmm2 +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO2) +#endif movsd 2 * SIZE(AO1), %xmm4 movhpd 2 * SIZE(AO2), %xmm4 movsd 3 * SIZE(AO1), %xmm6 movhpd 3 * SIZE(AO2), %xmm6 +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO3) +#endif movsd 0 * SIZE(AO3), %xmm1 movhpd 0 * SIZE(AO4), %xmm1 movsd 1 * SIZE(AO3), %xmm3 movhpd 1 * SIZE(AO4), %xmm3 +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO4) +#endif movsd 2 * SIZE(AO3), %xmm5 movhpd 2 * SIZE(AO4), %xmm5 movsd 3 * SIZE(AO3), %xmm7 movhpd 3 * SIZE(AO4), %xmm7 +#ifdef PREFETCHW PREFETCHW WPREFETCHSIZE * SIZE(B) +#endif movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movapd %xmm2, 4 * SIZE(B) @@ -342,10 +356,14 @@ movapd %xmm3, 6 * SIZE(B) #endif +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO1) PREFETCH RPREFETCHSIZE * SIZE(AO2) +#endif +#ifdef PREFETCHW PREFETCHW WPREFETCHSIZE * SIZE(B) +#endif addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S index ba7714b4..177587c4 100644 --- a/kernel/x86_64/gemm_tcopy_4.S +++ b/kernel/x86_64/gemm_tcopy_4.S @@ -219,31 +219,41 @@ movaps %xmm3, 12 * SIZE(BO) #else +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO1) +#endif movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO2) +#endif movsd 0 * SIZE(AO2), %xmm2 movhpd 1 * SIZE(AO2), %xmm2 movsd 2 * SIZE(AO2), %xmm3 movhpd 3 * SIZE(AO2), %xmm3 +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO3) 
+#endif movsd 0 * SIZE(AO3), %xmm4 movhpd 1 * SIZE(AO3), %xmm4 movsd 2 * SIZE(AO3), %xmm5 movhpd 3 * SIZE(AO3), %xmm5 +#ifdef PREFETCH PREFETCH RPREFETCHSIZE * SIZE(AO4) +#endif movsd 0 * SIZE(AO4), %xmm6 movhpd 1 * SIZE(AO4), %xmm6 movsd 2 * SIZE(AO4), %xmm7 movhpd 3 * SIZE(AO4), %xmm7 +#ifdef PREFETCHW PREFETCHW WPREFETCHSIZE * SIZE(BO) +#endif movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) diff --git a/kernel/x86_64/scal_atom.S b/kernel/x86_64/scal_atom.S index 11350ea1..284ea451 100644 --- a/kernel/x86_64/scal_atom.S +++ b/kernel/x86_64/scal_atom.S @@ -60,8 +60,10 @@ #ifdef WINDOWS_ABI movq 40(%rsp), X movq 48(%rsp), INCX - + movq 64(%rsp), %r9 movaps %xmm3, %xmm0 +#else + movq 24(%rsp), %r9 #endif SAVEREGISTERS @@ -73,6 +75,10 @@ lea (, INCX, SIZE), INCX comisd %xmm0, %xmm1 jne .L100 + jp .L100 + + cmpq $1, %r9 + je .L100 /* Alpha == ZERO */ cmpq $SIZE, INCX diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S index b92688d9..88ef4a3e 100644 --- a/kernel/x86_64/scal_sse.S +++ b/kernel/x86_64/scal_sse.S @@ -60,8 +60,10 @@ #ifdef WINDOWS_ABI movq 40(%rsp), X movq 48(%rsp), INCX - + movq 64(%rsp), %r9 movaps %xmm3, %xmm0 +#else + movq 24(%rsp), %r9 #endif SAVEREGISTERS @@ -77,6 +79,8 @@ jne .L100 # Alpha != ZERO + cmpq $1, %r9 + je .L100 /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S index 20dd7fa2..485e6ef4 100644 --- a/kernel/x86_64/scal_sse2.S +++ b/kernel/x86_64/scal_sse2.S @@ -48,6 +48,7 @@ #define X ARG2 #define INCX ARG3 #endif +#define FLAG %r9 #define XX %r10 #define I %rax @@ -60,8 +61,10 @@ #ifdef WINDOWS_ABI movq 40(%rsp), X movq 48(%rsp), INCX - + movq 64(%rsp), FLAG movaps %xmm3, %xmm0 +#else + movq 24(%rsp), FLAG #endif SAVEREGISTERS @@ -76,6 +79,8 @@ jne .L100 # Alpha != ZERO jp .L100 # For Alpha = NaN + cmpq $1, FLAG + je .L100 # disable the Alpha=zero path as it does not handle x=inf or nan /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 
diff --git a/kernel/x86_64/sscal.c b/kernel/x86_64/sscal.c index af1220f1..38b48eff 100644 --- a/kernel/x86_64/sscal.c +++ b/kernel/x86_64/sscal.c @@ -39,21 +39,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) { - BLASLONG i; - FLOAT alpha = *da; - - for( i=0; i 0 ) - { - sscal_kernel_inc_8(n1, &da, x, inc_x); - i = n1 * inc_x; - j = n1; - } - - while(j < n) - { - - x[i] *= da; - i += inc_x ; - j++; - - } - - } - - return(0); - } - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - if ( da == 0.0 ) - sscal_kernel_16_zero(n1 , &da , x); - else - sscal_kernel_16(n1 , &da , x); - } - - if ( da == 0.0 ) - { - for ( i=n1 ; i 0 ) + { + sscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + while(j < n) + { + x[i] *= da; + i += inc_x ; + j++; + } + } + else + { + BLASLONG n1 = n & -16; + if ( n1 > 0) + sscal_kernel_16(n1 , &da , x); + for ( i = n1 ; i < n; i++ ) + x[i] *= da; + } + } + else + { + if ( inc_x != 1 ) + { + if( da == 0.0) + { + BLASLONG n1 = n & -2; + while(j < n1) + { + x[i] = 0.0; + x[i+inc_x] = 0.0; + i += 2 * inc_x ; + j += 2; + } + while(j < n) + { + x[i] = 0.0; + i += inc_x ; + j++; + } + } + else + { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + sscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + while(j < n) + { + x[i] *= da; + i += inc_x ; + j++; + } + } + } + else + { + if ( da == 0.0 ) + { + BLASLONG n1 = n & -16; + if ( n1 > 0) + sscal_kernel_16_zero(n1, &da, x); + for ( i = n1 ; i < n; i++ ) + x[i] = 0.0; + } + else + { + BLASLONG n1 = n & -16; + if ( n1 > 0) + sscal_kernel_16(n1 , &da , x); + for ( i = n1 ; i < n; i++ ) + x[i] *= da; + } + } + } + return(0); } - - diff --git a/kernel/x86_64/tobf16.c b/kernel/x86_64/tobf16.c index a88fdcc2..e8e22895 100644 --- a/kernel/x86_64/tobf16.c +++ b/kernel/x86_64/tobf16.c @@ -144,10 +144,11 @@ void CNAME(BLASLONG n, FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLON if 
(inc_in == 0 || inc_out == 0 || n <= 100000) { nthreads = 1; } else { + nthreads = num_cpu_avail(1); if (n/100000 < 100) { - nthreads = 4; - } else { - nthreads = 16; + nthreads = MAX(nthreads,4); +// } else { +// nthreads = MAX(nthreads,16); } } diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse.S b/kernel/x86_64/zgemm_kernel_4x2_sse.S index 7d606aa6..5841f8b9 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_sse.S +++ b/kernel/x86_64/zgemm_kernel_4x2_sse.S @@ -102,6 +102,14 @@ #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) +#ifdef PREFETCH +#define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ; +#define PREFETCH_KERNEL4(xx) PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ; +#else +#define PREFETCH_KERNEL1(xx) +#define PREFETCH_KERNEL4(xx) +#endif + #ifndef GENERIC #define KERNEL1(xx) \ mulps %xmm0, %xmm1 ;\ @@ -111,7 +119,7 @@ addps %xmm3, %xmm9 ;\ movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ - PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + PREFETCH_KERNEL1(xx) \ mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ @@ -157,7 +165,7 @@ mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ - PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + PREFETCH_KERNEL4(xx) \ addps %xmm6, %xmm15 ;\ movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 @@ -1026,7 +1034,9 @@ .L22: mulps %xmm0, %xmm1 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 @@ -1079,7 +1089,9 @@ movaps 0 * SIZE(AO), %xmm0 mulps %xmm2, %xmm1 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif addps %xmm1, %xmm8 movaps 36 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 @@ -1285,7 +1297,9 @@ .L32: mulps 
%xmm0, %xmm1 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 @@ -1679,7 +1693,9 @@ .L52: mulps %xmm0, %xmm1 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 @@ -1705,7 +1721,9 @@ addps %xmm0, %xmm13 movaps 32 * SIZE(AO), %xmm0 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif mulps %xmm2, %xmm3 mulps -12 * SIZE(BO), %xmm2 @@ -1733,7 +1751,9 @@ addps %xmm2, %xmm13 movaps 48 * SIZE(AO), %xmm2 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif mulps %xmm4, %xmm5 mulps 4 * SIZE(BO), %xmm4 @@ -1761,7 +1781,9 @@ addps %xmm4, %xmm13 movaps 64 * SIZE(AO), %xmm4 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif mulps %xmm6, %xmm7 mulps 20 * SIZE(BO), %xmm6 @@ -1942,7 +1964,9 @@ .L62: mulps %xmm0, %xmm1 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 @@ -1968,7 +1992,9 @@ addps %xmm0, %xmm11 movaps 0 * SIZE(AO), %xmm0 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif mulps %xmm2, %xmm5 mulps 4 * SIZE(BO), %xmm2 @@ -2130,7 +2156,9 @@ .L72: mulps %xmm0, %xmm1 +#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index bc79c0ca..7859ef6e 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -258,13 +258,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = NAN; else temp0 = -da_i * x[i+1]; - x[i+1] = da_i * x[i]; + if (!isinf(x[i+1])) + x[i+1] = da_i * x[i]; + else x[i+1] = NAN; x[i] = temp0; if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) temp1 = NAN; else temp1 = -da_i * x[i+1+inc_x]; - x[i+1+inc_x] = da_i * x[i+inc_x]; + if (!isinf(x[i+1+inc_x])) + x[i+1+inc_x] = da_i * 
x[i+inc_x]; + else x[i+1+inc_x] = NAN; x[i+inc_x] = temp1; i += 2*inc_x ; j+=2; @@ -278,7 +282,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = NAN; else temp0 = -da_i * x[i+1]; - x[i+1] = da_i * x[i]; + if (!isinf(x[i+1])) + x[i+1] = da_i * x[i]; + else x[i+1] = NAN; x[i] = temp0; i += inc_x ; j++; @@ -394,7 +400,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } } - else if (da_r < -FLT_MAX || da_r > FLT_MAX) { + else if (isinf(da_r)) { while(j < n) { x[i]= NAN; @@ -410,9 +416,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while(j < n) { temp0 = -da_i * x[i+1]; - if (x[i] < -FLT_MAX || x[i] > FLT_MAX) + if (isinf(x[i])) temp0 = NAN; - x[i+1] = da_i * x[i]; + if (!isinf(x[i+1])) + x[i+1] = da_i * x[i]; + else x[i+1] = NAN; if ( x[i] == x[i]) //preserve NaN x[i] = temp0; i += 2 ; diff --git a/kernel/x86_64/zscal_atom.S b/kernel/x86_64/zscal_atom.S index 1649b855..7713626c 100644 --- a/kernel/x86_64/zscal_atom.S +++ b/kernel/x86_64/zscal_atom.S @@ -74,7 +74,7 @@ pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L30 # Alpha_r != ZERO - + jp .L30 comisd %xmm1, %xmm15 jne .L30 # Alpha_i != ZERO diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S index 8505c67b..acd6c365 100644 --- a/kernel/x86_64/zscal_sse.S +++ b/kernel/x86_64/zscal_sse.S @@ -76,7 +76,7 @@ pxor %xmm15, %xmm15 comiss %xmm0, %xmm15 jne .L100 # Alpha_r != ZERO - + jp .L100 # Alpha_r == NAN comiss %xmm1, %xmm15 jne .L100 # Alpha_i != ZERO diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index fa61ac93..acc5ad59 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -484,7 +484,9 @@ addpd a1, yy1 MOVDDUP(1 * SIZE, A1, a1) +#ifdef PREFETCH PREFETCH PREFETCHSIZE(A1) +#endif movapd xtemp3, xt1 mulpd a2, xt1 @@ -507,7 +509,9 @@ addpd a2, yy2 MOVDDUP(0 * SIZE, A2, a2) +#ifdef PREFETCH PREFETCH PREFETCHSIZE(XX) +#endif movapd xtemp3, 
xt1 movapd 12 * SIZE(XX), xtemp3 @@ -546,7 +550,9 @@ addpd a2, yy1 MOVDDUP(6 * SIZE, A2, a2) +#ifdef PREFETCH PREFETCH PREFETCHSIZE(A2) +#endif movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) @@ -574,7 +580,9 @@ addpd a1, yy1 MOVDDUP(6 * SIZE, A1, a1) +#ifdef PREFETCHW PREFETCHW PREFETCHSIZE(YY) +#endif movapd xtemp4, xt1 mulpd a2, xt1 diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 1657885c..fa1fe9fe 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -442,7 +442,9 @@ addpd a1, yy1 MOVDDUP(3 * SIZE, A2, a1) +#ifdef PREFETCH PREFETCH PREFETCHSIZE(A1) +#endif movapd xtemp3, xt1 mulpd a2, xt1 @@ -465,7 +467,9 @@ addpd a1, yy2 MOVDDUP(3 * SIZE, A1, a1) +#ifdef PREFETCH PREFETCH PREFETCHSIZE(XX) +#endif movapd xtemp3, xt1 movapd 12 * SIZE(XX), xtemp3 @@ -504,7 +508,9 @@ addpd a2, yy1 MOVDDUP(5 * SIZE, A1, a2) +#ifdef PREFETCH PREFETCH PREFETCHSIZE(A2) +#endif movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) @@ -532,7 +538,9 @@ addpd a2, yy1 MOVDDUP(4 * SIZE, A2, a2) +#ifdef PREFETCH PREFETCHW PREFETCHSIZE(YY) +#endif movapd xtemp4, xt1 mulpd a3, xt1 diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index cd86db28..02b5098a 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -109,12 +109,20 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif +#ifdef PREFETCH +#define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ; +#define PREFETCH_KERNEL5(xx) PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ; +#else +#define PREFETCH_KERNEL1(xx) +#define PREFETCH_KERNEL5(xx) +#endif + #define KERNEL1(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ - PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + PREFETCH_KERNEL1(xx) \ addps %xmm11, %xmm1 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ @@ 
-171,7 +179,7 @@ addps %xmm9, %xmm0 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ - PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + PREFETCH_KERNEL5(xx) \ addps %xmm11, %xmm1 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index 53e5bb7f..0c3a052a 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -109,12 +109,20 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif +#ifdef PREFETCH +#define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ; +#define PREFETCH_KERNEL5(xx) PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ; +#else +#define PREFETCH_KERNEL1(xx) +#define PREFETCH_KERNEL5(xx) +#endif + #define KERNEL1(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ - PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + PREFETCH_KERNEL1(xx) \ addps %xmm11, %xmm1 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ @@ -171,7 +179,7 @@ addps %xmm9, %xmm0 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ - PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + PREFETCH_KERNEL5(xx) \ addps %xmm11, %xmm1 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index 20b93e19..518e1b4f 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -109,12 +109,20 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif +#ifdef PREFETCH +#define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ; +#define PREFETCH_KERNEL5(xx) PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ; +#else +#define PREFETCH_KERNEL1(xx) +#define 
PREFETCH_KERNEL5(xx) +#endif + #define KERNEL1(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ - PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + PREFETCH_KERNEL1(xx) \ addps %xmm11, %xmm1 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ @@ -171,7 +179,7 @@ addps %xmm9, %xmm0 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ - PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + PREFETCH_KERNEL5(xx) \ addps %xmm11, %xmm1 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index 57bb89c0..e623f306 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -234,12 +234,23 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { while (j < n1) { - - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; + if (!isinf(x[i + 1])) + x[i + 1] = da_i * x[i]; + else + x[i + 1] = NAN; x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; + if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) + temp1 = NAN; + else + temp1 = -da_i * x[i + 1 + inc_x]; + if (!isinf(x[i + 1 + inc_x])) + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + else + x[i + 1 + inc_x] = NAN; x[i + inc_x] = temp1; i += 2 * inc_x; j += 2; @@ -247,9 +258,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } while (j < n) { - - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; + if (isinf(x[i + 1])) + x[i + 1] = NAN; + else + x[i + 1] = da_i * x[i]; x[i] = temp0; i += inc_x; j++; @@ -332,26 +348,42 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, j = n1; } - if (da_r == 0.0) { + if (da_r == 0.0 || 
isnan(da_r)) { if (da_i == 0.0) { - + float res = 0.0; + if (isnan(da_r)) res = da_r; while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; + x[i] = res; + x[i + 1] = res; i += 2; j++; } + } else if (isinf(da_r)) { + while(j < n) + { + + x[i]= NAN; + x[i+1] = da_r; + i += 2 ; + j++; + + } } else { while (j < n) { temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; + if (isinf(x[i])) temp0 = NAN; + if (!isinf(x[i + 1])) + x[i + 1] = da_i * x[i]; + else + x[i + 1] = NAN; + if (x[i] == x[i]) + x[i] = temp0; i += 2; j++; diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index a5a5e346..14695602 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -96,20 +96,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, if (inc_x == 1) { if (da == 0.0) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - dscal_kernel_16_zero(n1, x); - j = n1; + + if (dummy2 == 0) { + BLASLONG n1 = n & -16; + if (n1 > 0) { + dscal_kernel_16_zero(n1, x); + j = n1; + } + + while (j < n) { + x[j] = 0.0; + j++; + } + } else { + while (j < n) { + if (isfinite(x[j])) + x[j] = 0.0; + else + x[j] = NAN; + j++; + } } - - while (j < n) { - - x[j] = 0.0; - j++; - } - + } else { BLASLONG n1 = n & -16; @@ -127,11 +135,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, } else { if (da == 0.0) { - + if (dummy2 == 0) { BLASLONG n1 = n & -4; - while (j < n1) { - x[i] = 0.0; x[i + inc_x] = 0.0; x[i + 2 * inc_x] = 0.0; @@ -139,11 +145,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, i += inc_x * 4; j += 4; - } + } while (j < n) { - - x[i] = 0.0; + if (dummy2==0 || isfinite(x[i])) + x[i] = 0.0; + else + x[i] = NAN; i += inc_x; j++; } diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index da2f49ea..67772751 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -95,21 +95,31 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, if (inc_x == 1) { - if (da == 0.0) 
{ - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - sscal_kernel_32_zero(n1, x); - j = n1; - } - - while (j < n) { - - x[j] = 0.0; - j++; + if (da == 0.0 || !isfinite(da)) { + if (dummy2 == 0) { + BLASLONG n1 = n & -32; + if (n1 > 0) { + + sscal_kernel_32_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + } else { + float res = 0.0; + if (!isfinite(da)) res = NAN; + while (j < n) { + if (isfinite(x[i])) + x[j] = res; + else + x[j] = NAN; + j++; + } } - } else { BLASLONG n1 = n & -32; @@ -126,26 +136,37 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, } else { - if (da == 0.0) { - - BLASLONG n1 = n & -2; - - while (j < n1) { - - x[i] = 0.0; - x[i + inc_x] = 0.0; - - i += inc_x * 2; - j += 2; - - } - while (j < n) { - - x[i] = 0.0; - i += inc_x; - j++; - } - + if (da == 0.0 || !isfinite(da)) { + if (dummy2 == 0) { + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = 0.0; + x[i + inc_x] = 0.0; + + i += inc_x * 2; + j += 2; + + } + while (j < n) { + + x[i] = 0.0; + i += inc_x; + j++; + } + } else { + while (j < n) { + float res = 0.0; + if (!isfinite(da)) res = NAN; + if (isfinite(x[i])) + x[i] = res; + else + x[i] = NAN; + i += inc_x; + j++; + } + } } else { BLASLONG n1 = n & -2; diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4160a1a7..36466a6e 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -237,13 +237,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = NAN; else temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; + if (!isinf(x[i + 1])) + x[i + 1] = da_i * x[i]; + else + x[i + 1] = NAN; x[i] = temp0; if (isnan(x[i + inc_x]) || isinf(x[i + inc_x])) temp1 = NAN; else temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; + if (!isinf(x[i + 1 + inc_x])) + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + else + x[i + 1 + inc_x] = NAN; x[i + inc_x] = temp1; i += 2 * inc_x; j += 2; @@ -256,7 +262,10 @@ int CNAME(BLASLONG n, 
BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = NAN; else temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; + if (!isinf(x[i +1])) + x[i + 1] = da_i * x[i]; + else + x[i + 1] = NAN; x[i] = temp0; i += inc_x; j++; @@ -330,7 +339,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, zscal_kernel_8_zero(n1, x); else zscal_kernel_8(n1, da_r, da_i, x); - else if (da_i == 0) + else if (da_i == 0 && da_r == da_r) zscal_kernel_8_zero_i(n1, alpha, x); else zscal_kernel_8(n1, da_r, da_i, x); @@ -339,29 +348,41 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, j = n1; } - if (da_r == 0.0) { + if (da_r == 0.0 || isnan(da_r)) { if (da_i == 0.0) { - + double res= 0.0; + if (isnan(da_r)) res = da_r; while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; + x[i] = res; + x[i + 1] = res; i += 2; j++; } + } else if (isinf(da_r)) { + while (j < n) { + x[i] = NAN; + x[i + 1] = da_r; + i += 2; + j++; + } } else { while (j < n) { - if (isnan(x[i]) || isinf(x[i])) + if (isinf(x[i])) temp0 = NAN; else temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; + if (!isinf(x[i + 1])) + x[i + 1] = da_i * x[i]; + else + x[i + 1] = NAN; + if (x[i]==x[i]) + x[i] = temp0; i += 2; j++; diff --git a/lapack/laswp/riscv64/Makefile b/lapack/laswp/riscv64/Makefile index 75411deb..bc39a30f 100644 --- a/lapack/laswp/riscv64/Makefile +++ b/lapack/laswp/riscv64/Makefile @@ -1,6 +1,11 @@ TOPDIR = ../../.. 
include ../../../Makefile.system +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP LASWP = ../generic/laswp_k.c endif diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 7d6bcd77..6a2e4d43 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LN, sa, sb, args -> nthreads); #endif } } diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 1f142727..de7d3337 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UC, sa, sb, args -> nthreads); #endif } } diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..374b03e3 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,59 @@ +site_name: OpenBLAS +site_url: https://openblas.net/docs/ +repo_url: https://github.com/OpenMathLib/OpenBLAS +copyright: Copyright © 2012- OpenBLAS contributors + +theme: + name: material + logo: logo.svg + favicon: logo.svg + features: + - header.autohide + palette: + # Palette toggle for dark mode + - scheme: slate + primary: blue grey + toggle: + icon: material/brightness-4 + name: Switch to light mode + + # Palette toggle for light mode + - 
scheme: default + primary: blue grey + toggle: + icon: material/brightness-7 + name: Switch to dark mode + +plugins: + - search + - git-revision-date-localized: + enable_creation_date: true + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - footnotes + - pymdownx.tabbed: + alternate_style: true + - toc: + toc_depth: 4 + +nav: + - index.md + - install.md + - user_manual.md + - extensions.md + - developers.md + - build_system.md + - distributing.md + - ci.md + - about.md + - faq.md + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/OpenMathLib/OpenBLAS + - icon: material/license + link: https://github.com/OpenMathLib/OpenBLAS/LICENSE diff --git a/param.h b/param.h index 445bab08..2618e1f6 100644 --- a/param.h +++ b/param.h @@ -2553,7 +2553,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER9) && defined(OS_LINUX) +#if defined(POWER9) && (defined(OS_LINUX) || defined(OS_FREEBSD)) #define SNUMOPT 16 #define DNUMOPT 8 @@ -2842,7 +2842,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SNUMOPT 2 #define DNUMOPT 2 -#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_A 0x20000 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL @@ -2856,7 +2856,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 1 #else -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 6 #define DGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_M 16 @@ -2864,6 +2864,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_M 16 #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_MN 96 #endif #define QGEMM_DEFAULT_UNROLL_N 2 @@ -2872,20 +2873,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 -#define SGEMM_DEFAULT_P 256 -#define DGEMM_DEFAULT_P 32 +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p #define CGEMM_DEFAULT_P 128 -#define ZGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P zgemm_p -#define SGEMM_DEFAULT_R 1024 -#define DGEMM_DEFAULT_R 858 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R zgemm_r -#define SGEMM_DEFAULT_Q 256 -#define DGEMM_DEFAULT_Q 152 +#define SGEMM_DEFAULT_Q sgemm_q +#define DGEMM_DEFAULT_Q dgemm_q #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q zgemm_q #define SYMV_P 16 #endif @@ -3547,8 +3548,10 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 4 #else #define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 8 #endif #define SGEMM_DEFAULT_UNROLL_M 16 diff --git a/test/Makefile b/test/Makefile index 6a50b6c9..cfb2d41f 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,9 @@ TOPDIR = .. 
include ../Makefile.system ifeq ($(F_COMPILER),GFORTRAN) +ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) + override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 +endif override FFLAGS += -fno-tree-vectorize endif @@ -186,8 +189,11 @@ endif endif +ifeq ($(SUPPORT_GEMM3M),1) +level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m +else level3: $(B3) $(S3) $(D3) $(C3) $(Z3) - +endif ifneq ($(CROSS), 1) rm -f ?BLAT3.SUMM @@ -260,7 +266,7 @@ endif endif -level3_3m : zblat3_3m cblat3_3m +level3_3m: zblat3_3m cblat3_3m ifneq ($(CROSS), 1) rm -f ?BLAT3_3M.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index bc74233a..de589458 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -86,14 +86,26 @@ main (int argc, char *argv[]) { blasint m, n, k; int i, j, l; - blasint x; + blasint x, y; int ret = 0; int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; for (x = 0; x <= loop; x++) + { + for (y = 0; y < 4; y++) { + if ((y == 0) || (y == 2)) { + transA = 'N'; + } else { + transA = 'T'; + } + if ((y == 0) || (y == 1)) { + transB = 'N'; + } else { + transB = 'T'; + } m = k = n = x; float A[m * k]; float B[k * n]; @@ -104,43 +116,55 @@ main (int argc, char *argv[]) blasint one=1; for (j = 0; j < m; j++) - { - for (i = 0; i < m; i++) - { - A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - C[j * k + i] = 0; - sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); - sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); - AA[j * k + i].v = atmp; - BB[j * k + i].v = btmp; - CC[j * k + i] = 0; - DD[j * k + i] = 0; - } - } + { + for (i = 0; i < m; i++) + { + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + C[j * k + i] = 0; + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, 
&btmp, &one); + AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; + CC[j * k + i] = 0; + DD[j * k + i] = 0; + } + } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, - &m, (bfloat16*)BB, &k, &beta, CC, &m); + &m, (bfloat16*)BB, &k, &beta, CC, &m); + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + ret++; for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) - ret++; - if (transA == 'N' && transB == 'N') - { - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - { - DD[i * m + j] += - float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); - } - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - if (CC[i * m + j] != DD[i * m + j]) - ret++; - } + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (transA == 'N' && transB == 'N') + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } else if (transA == 'T' && transB == 'N') + { + DD[i * m + j] += + float16to32 (AA[k * j + l]) * float16to32 (BB[l + k * i]); + } else if (transA == 'N' && transB == 'T') + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[i + l * n]); + } else if (transA == 'T' && transB == 'T') + { + DD[i * m + j] += + float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]); + } + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + if (CC[i * m + j] != DD[i * m + j]) + ret++; } + } + if (ret != 0) fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); return ret; diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 4771d8a2..6a61899d 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -18,6 +18,7 @@ else () test_zscal.c test_amin.c test_axpby.c + test_gemv.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index ce0f5c43..b8293709 100644 --- a/utest/Makefile +++ 
b/utest/Makefile @@ -14,7 +14,7 @@ UTESTEXTBIN=openblas_utest_ext include $(TOPDIR)/Makefile.system OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ - test_amin.o test_axpby.o + test_amin.o test_axpby.o test_gemv.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o OBJS_EXT=utest_main.o $(DIR_EXT)/xerbla.o $(DIR_EXT)/common.o OBJS_EXT+=$(DIR_EXT)/test_isamin.o $(DIR_EXT)/test_idamin.o $(DIR_EXT)/test_icamin.o $(DIR_EXT)/test_izamin.o @@ -58,6 +58,11 @@ OBJS = utest_main2.o endif ifeq ($(OSNAME), AIX) OBJS = utest_main2.o +OBJS_EXT = $(DIR_EXT)/utest_main2.o +endif + +ifeq ($(NO_CBLAS), 1) +override CFLAGS += -DNO_CBLAS endif all : run_test diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c index 8a6a4779..808aa545 100644 --- a/utest/test_extensions/common.c +++ b/utest/test_extensions/common.c @@ -69,7 +69,7 @@ float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint for (j = 0; j < cols; j++) { a_ptr[j] -= b_ptr[j]; } - norm += cblas_snrm2(cols, a_ptr, inc); + norm += BLASFUNC(snrm2)(&cols, a_ptr, &inc); a_ptr += ld; b_ptr += ld; @@ -92,7 +92,7 @@ double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blas for (j = 0; j < cols; j++) { a_ptr[j] -= b_ptr[j]; } - norm += cblas_dnrm2(cols, a_ptr, inc); + norm += BLASFUNC(dnrm2)(&cols, a_ptr, &inc); a_ptr += ld; b_ptr += ld; @@ -256,4 +256,4 @@ void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src a_dst[i*lda_dst+j+1] = (-1.0) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j]; } } -} \ No newline at end of file +} diff --git a/utest/test_extensions/test_caxpby.c b/utest/test_extensions/test_caxpby.c index 221a48ac..8adf5b3e 100644 --- a/utest/test_extensions/test_caxpby.c +++ b/utest/test_extensions/test_caxpby.c @@ -96,7 +96,7 @@ static float check_caxpby(blasint n, float 
*alpha, blasint incx, float *beta, bl // Find the norm of differences return BLASFUNC(scnrm2)(&n, data_caxpby.y_test, &incy_abs); } - +#ifndef NO_CBLAS /** * C API specific function * Test caxpby by comparing it with cscal and caxpy. @@ -146,7 +146,7 @@ static float c_api_check_caxpby(blasint n, float *alpha, blasint incx, float *be // Find the norm of differences return cblas_scnrm2(n, data_caxpby.y_test, incy_abs); } - +#endif /** * Fortran API specific test * Test caxpby by comparing it with cscal and caxpy. @@ -388,6 +388,7 @@ CTEST(caxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test caxpby by comparing it with cscal and caxpy. @@ -629,3 +630,4 @@ CTEST(caxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c index 9b87ad9f..55c52137 100644 --- a/utest/test_extensions/test_cgeadd.c +++ b/utest/test_extensions/test_cgeadd.c @@ -62,13 +62,14 @@ static void cgeadd_trusted(blasint m, blasint n, float *alpha, float *aptr, blasint lda, float *beta, float *cptr, blasint ldc) { blasint i; + blasint one=1; lda *= 2; ldc *= 2; for (i = 0; i < n; i++) { - cblas_caxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(caxpby)(&m, alpha, aptr, &one, beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -116,9 +117,11 @@ static float check_cgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, beta, data_cgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, beta, data_cgeadd.c_test, ldc); +#endif // Find the differences between output matrix caculated by cgeadd and sgemm return smatrix_difference(data_cgeadd.c_test, data_cgeadd.c_verify, cols, rows, ldc*2); @@ -150,9 +153,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') 
BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, beta, data_cgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, beta, data_cgeadd.c_test, ldc); +#endif return check_error(); } @@ -419,7 +424,7 @@ CTEST(cgeadd, m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } - +#ifndef NO_CBLAS /** * C API specific test * Test cgeadd by comparing it against sgemm @@ -877,4 +882,5 @@ CTEST(cgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_cgemm.c b/utest/test_extensions/test_cgemm.c index cd38d710..15d64e37 100644 --- a/utest/test_extensions/test_cgemm.c +++ b/utest/test_extensions/test_cgemm.c @@ -73,9 +73,10 @@ static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint float alpha_conj[] = {1.0f, 0.0f}; char transa_verify = transa; char transb_verify = transb; + char cc[2]="C", cr[2]="R"; - int arows = k, acols = m; - int brows = n, bcols = k; + blasint arows = k, acols = m; + blasint brows = n, bcols = k; if (transa == 'T' || transa == 'C'){ arows = m; acols = k; @@ -99,12 +100,12 @@ static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint data_cgemm.c_verify[i] = data_cgemm.c_test[i]; if (transa == 'R'){ - cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_cgemm.a_verify, lda, lda); + BLASFUNC(cimatcopy)(cc, cr, &arows, &acols, alpha_conj, data_cgemm.a_verify, &lda, &lda); transa_verify = 'N'; } if (transb == 'R'){ - cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_cgemm.b_verify, ldb, ldb); + BLASFUNC(cimatcopy)(cc, cr, &brows, &bcols, alpha_conj, data_cgemm.b_verify, &ldb, &ldb); transb_verify = 'N'; } @@ -270,4 +271,4 @@ CTEST(cgemm, transa_conjnotransb) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_cgemmt.c 
b/utest/test_extensions/test_cgemmt.c index ed927993..dfeb06ff 100644 --- a/utest/test_extensions/test_cgemmt.c +++ b/utest/test_extensions/test_cgemmt.c @@ -73,9 +73,11 @@ static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra if(api == 'F') BLASFUNC(cgemm)(&transa, &transb, &m, &m, &k, alpha, data_cgemmt.a_test, &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_cgemm(order, transa, transb, m, m, k, alpha, data_cgemmt.a_test, lda, data_cgemmt.b_test, ldb, beta, data_cgemmt.c_gemm, ldc); +#endif ldc *= 2; @@ -160,9 +162,11 @@ static float check_cgemmt(char api, enum CBLAS_ORDER order, char uplo, char tran if (api == 'F') BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc * 2; i++) data_cgemmt.c_verify[i] -= data_cgemmt.c_test[i]; @@ -197,9 +201,11 @@ static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); +#endif return check_error(); } @@ -680,6 +686,7 @@ CTEST(cgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test cgemmt by comparing it against sgemm @@ -1591,6 +1598,7 @@ CTEST(cgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Fortran API specific test @@ -1736,6 +1744,7 @@ CTEST(cgemmt, xerbla_ldc_invalid) ASSERT_EQUAL(TRUE, passed); } +#ifndef NO_CBLAS /** * C API 
specific test. * Test error function for an invalid param order. @@ -2007,4 +2016,5 @@ CTEST(cgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c index cb4e5ad9..dd95d32b 100644 --- a/utest/test_extensions/test_cgemv_t.c +++ b/utest/test_extensions/test_cgemv_t.c @@ -65,6 +65,7 @@ static struct DATA_CGEMV_T data_cgemv_t; static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) { blasint i; + blasint one=1; float *a_ptr = data_cgemv_t.a_verify; float *x_ptr = data_cgemv_t.x_test; float *x_res = data_cgemv_t.x_verify; @@ -73,7 +74,11 @@ static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc for (i = 0; i < n * inc_x; i+= inc_x) { - result = cblas_cdotu(lda, a_ptr, 1, x_ptr, inc_x); +#ifdef RETURN_BY_STACK + BLASFUNC(cdotu)(&result, &lda, a_ptr, &one, x_ptr, &inc_x); +#else + result = BLASFUNC(cdotu)(&lda, a_ptr, &one, x_ptr, &inc_x); +#endif x_res[0] = CREAL(result); x_res[1] = CIMAG(result); a_ptr += lda * 2; @@ -153,6 +158,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_t.a_test, &lda, data_cgemv_t.x_test, &inc_x, beta, data_cgemv_t.y_test, &inc_y); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -173,13 +179,14 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, cblas_cgemv(corder, ctrans, m, n, alpha, data_cgemv_t.a_test, lda, data_cgemv_t.x_test, inc_x, beta, data_cgemv_t.y_test, inc_y); } +#endif // Find the differences between output vector caculated by cgemv and reference funcs for (i = 0; i < m * inc_y * 2; i++) data_cgemv_t.y_test[i] -= data_cgemv_t.y_verify[i]; // Find the norm of differences - return cblas_scnrm2(m, 
data_cgemv_t.y_test, inc_y); + return BLASFUNC(scnrm2)(&m, data_cgemv_t.y_test, &inc_y); } /** @@ -213,6 +220,7 @@ static int check_badargs(char order, char trans, blasint m, blasint n, return check_error(); } +#ifndef NO_CBLAS /** * C API specific function * Check if error function was called with expected function name @@ -1130,3 +1138,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major) ASSERT_EQUAL(TRUE, passed); } #endif +#endif diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c index a4b1e30a..0c96a3b1 100644 --- a/utest/test_extensions/test_cimatcopy.c +++ b/utest/test_extensions/test_cimatcopy.c @@ -98,6 +98,7 @@ static float check_cimatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -108,6 +109,7 @@ static float check_cimatcopy(char api, char order, char trans, blasint rows, bla cblas_cimatcopy(corder, ctrans, rows, cols, alpha, data_cimatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by cimatcopy and reference func return smatrix_difference(data_cimatcopy.a_test, data_cimatcopy.a_verify, cols_out, rows_out, 2*lda_dst); @@ -502,6 +504,7 @@ CTEST(cimatcopy, rowmajor_conjtrans_col_50_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test cimatcopy by comparing it against reference @@ -681,6 +684,7 @@ CTEST(cimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. 
@@ -815,4 +819,4 @@ CTEST(cimatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c index 71663406..b493c93a 100644 --- a/utest/test_extensions/test_comatcopy.c +++ b/utest/test_extensions/test_comatcopy.c @@ -99,6 +99,7 @@ static float check_comatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, &lda, data_comatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -109,6 +110,7 @@ static float check_comatcopy(char api, char order, char trans, blasint rows, bla cblas_comatcopy(corder, ctrans, rows, cols, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_test, ldb); } +#endif return smatrix_difference(data_comatcopy.b_test, data_comatcopy.b_verify, b_cols, b_rows, ldb*2); } @@ -316,6 +318,7 @@ CTEST(comatcopy, rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test comatcopy by comparing it against refernce @@ -491,6 +494,7 @@ CTEST(comatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_crot.c b/utest/test_extensions/test_crot.c index 1c55216d..1ff45681 100644 --- a/utest/test_extensions/test_crot.c +++ b/utest/test_extensions/test_crot.c @@ -107,6 +107,7 @@ static float check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, floa return (norm / 2); } +#ifndef NO_CBLAS /** * C API specific function * Comapare results computed by csrot and caxpby @@ -789,4 +790,5 @@ CTEST(crot, c_api_check_n_zero) float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c index 84875ccf..bb23a5a0 100644 --- a/utest/test_extensions/test_crotg.c +++ b/utest/test_extensions/test_crotg.c @@ -161,7 +161,7 @@ CTEST(crotg, negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); } - +#ifndef NO_CBLAS /** * C API specific test * Test crotg by comparing it against pre-calculated values @@ -287,4 +287,5 @@ CTEST(crotg, c_api_negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_cscal.c b/utest/test_extensions/test_cscal.c index 009c600a..cf8b3559 100644 --- a/utest/test_extensions/test_cscal.c +++ b/utest/test_extensions/test_cscal.c @@ -91,8 +91,10 @@ static float check_cscal(char api, blasint n, float *alpha, blasint inc) if(api == 'F') BLASFUNC(cscal)(&n, alpha, data_cscal.x_test, &inc); +#ifndef NO_CBLAS else cblas_cscal(n, alpha, data_cscal.x_test, inc); +#endif // Find the differences between output vector computed by cscal and cscal_trusted for (i = 0; i < n * 2 * inc; i++) @@ -132,6 +134,7 @@ CTEST(cscal, alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** 
* C API specific test * Test cscal by comparing it against reference @@ -161,4 +164,5 @@ CTEST(cscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_ctrmv.c b/utest/test_extensions/test_ctrmv.c index 2a3f2741..4c61c31c 100644 --- a/utest/test_extensions/test_ctrmv.c +++ b/utest/test_extensions/test_ctrmv.c @@ -65,6 +65,7 @@ static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint ld blasint i; float alpha_conj[] = {1.0f, 0.0f}; char trans_verify = trans; + char cc[2]="C", cr[2]="R"; srand_generate(data_ctrmv.a_test, n * lda * 2); srand_generate(data_ctrmv.x_test, n * incx * 2); @@ -76,7 +77,7 @@ static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint ld data_ctrmv.x_verify[i] = data_ctrmv.x_test[i]; if (trans == 'R'){ - cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ctrmv.a_verify, lda, lda); + BLASFUNC(cimatcopy)(cc, cr, &n, &n, alpha_conj, data_ctrmv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -263,4 +264,4 @@ CTEST(ctrmv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_ctrsv.c b/utest/test_extensions/test_ctrsv.c index 0e639bb2..7298ba42 100644 --- a/utest/test_extensions/test_ctrsv.c +++ b/utest/test_extensions/test_ctrsv.c @@ -65,6 +65,7 @@ static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint ld blasint i; float alpha_conj[] = {1.0f, 0.0f}; char trans_verify = trans; + char cc[2]="C", cr[2]="R"; srand_generate(data_ctrsv.a_test, n * lda * 2); srand_generate(data_ctrsv.x_test, n * incx * 2); @@ -76,8 +77,8 @@ static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint ld data_ctrsv.x_verify[i] = data_ctrsv.x_test[i]; if (trans == 'R'){ - cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, - 
alpha_conj, data_ctrsv.a_verify, lda, lda); + BLASFUNC(cimatcopy)(cc, cr, &n, &n, + alpha_conj, data_ctrsv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -264,4 +265,4 @@ CTEST(ctrsv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c index 736921fa..50bc5a92 100644 --- a/utest/test_extensions/test_damin.c +++ b/utest/test_extensions/test_damin.c @@ -351,4 +351,4 @@ CTEST(damin, negative_step_2_N_70){ double amin = BLASFUNC(damin)(&N, x, &inc); ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_daxpby.c b/utest/test_extensions/test_daxpby.c index 6e77c7c7..93b26810 100644 --- a/utest/test_extensions/test_daxpby.c +++ b/utest/test_extensions/test_daxpby.c @@ -97,6 +97,7 @@ static double check_daxpby(blasint n, double alpha, blasint incx, double beta, b return BLASFUNC(dnrm2)(&n, data_daxpby.y_test, &incy_abs); } +#ifndef NO_CBLAS /** * C API specific function * Test daxpby by comparing it with dscal and daxpy. @@ -142,7 +143,7 @@ static double c_api_check_daxpby(blasint n, double alpha, blasint incx, double b // Find the norm of differences return cblas_dnrm2(n, data_daxpby.y_test, incy_abs); } - +#endif /** * Fortran API specific test * Test daxpby by comparing it with dscal and daxpy. @@ -468,6 +469,7 @@ CTEST(daxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test daxpby by comparing it with dscal and daxpy. 
@@ -796,4 +798,5 @@ CTEST(daxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c index 8f93a842..20e8d966 100644 --- a/utest/test_extensions/test_dgeadd.c +++ b/utest/test_extensions/test_dgeadd.c @@ -62,10 +62,11 @@ static void dgeadd_trusted(blasint m, blasint n, double alpha, double *aptr, blasint lda, double beta, double *cptr, blasint ldc) { blasint i; + blasint one=1; for (i = 0; i < n; i++) { - cblas_daxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(daxpby)(&m, &alpha, aptr, &one, &beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -113,9 +114,11 @@ static double check_dgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, &beta, data_dgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, beta, data_dgeadd.c_test, ldc); +#endif // Find the differences between output matrix caculated by dgeadd and sgemm return dmatrix_difference(data_dgeadd.c_test, data_dgeadd.c_verify, cols, rows, ldc); @@ -147,9 +150,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, &beta, data_dgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, beta, data_dgeadd.c_test, ldc); +#endif return check_error(); } @@ -417,6 +422,7 @@ CTEST(dgeadd, m_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dgeadd by comparing it against reference @@ -875,4 +881,5 @@ CTEST(dgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c index 22dcaf2a..fd8f5f66 100644 --- 
a/utest/test_extensions/test_dgemmt.c +++ b/utest/test_extensions/test_dgemmt.c @@ -73,9 +73,11 @@ static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra if(api == 'F') BLASFUNC(dgemm)(&transa, &transb, &m, &m, &k, &alpha, data_dgemmt.a_test, &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc); +#endif if (uplo == 'L' || uplo == CblasLower) { @@ -152,9 +154,11 @@ static double check_dgemmt(char api, enum CBLAS_ORDER order, char uplo, char tra if (api == 'F') BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc; i++) data_dgemmt.c_verify[i] -= data_dgemmt.c_test[i]; @@ -189,9 +193,11 @@ static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); +#endif return check_error(); } @@ -480,6 +486,7 @@ CTEST(dgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dgemmt by comparing it against dgemm @@ -1023,6 +1030,7 @@ CTEST(dgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Fortran API specific test @@ -1168,6 +1176,7 @@ CTEST(dgemmt, xerbla_ldc_invalid) ASSERT_EQUAL(TRUE, passed); } +#ifndef NO_CBLAS /** * C API specific test. 
* Test error function for an invalid param order. @@ -1439,4 +1448,5 @@ CTEST(dgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c index 811c356b..eebb7669 100644 --- a/utest/test_extensions/test_dimatcopy.c +++ b/utest/test_extensions/test_dimatcopy.c @@ -93,6 +93,7 @@ static double check_dimatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -103,6 +104,7 @@ static double check_dimatcopy(char api, char order, char trans, blasint rows, bl cblas_dimatcopy(corder, ctrans, rows, cols, alpha, data_dimatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by dimatcopy and reference func return dmatrix_difference(data_dimatcopy.a_test, data_dimatcopy.a_verify, cols_out, rows_out, lda_dst); @@ -687,6 +689,7 @@ CTEST(dimatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dimatcopy by comparing it against reference @@ -778,6 +781,7 @@ CTEST(dimatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
@@ -912,4 +916,4 @@ CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c index e60b9c83..e892271d 100644 --- a/utest/test_extensions/test_domatcopy.c +++ b/utest/test_extensions/test_domatcopy.c @@ -94,6 +94,7 @@ static double check_domatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, &lda, data_domatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -104,6 +105,7 @@ static double check_domatcopy(char api, char order, char trans, blasint rows, bl cblas_domatcopy(corder, ctrans, rows, cols, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_test, ldb); } +#endif return dmatrix_difference(data_domatcopy.b_test, data_domatcopy.b_verify, b_cols, b_rows, ldb); } @@ -412,6 +414,7 @@ CTEST(domatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test domatcopy by comparing it against refernce @@ -503,6 +506,7 @@ CTEST(domatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_drotmg.c b/utest/test_extensions/test_drotmg.c index 3073c8e3..3755776c 100644 --- a/utest/test_extensions/test_drotmg.c +++ b/utest/test_extensions/test_drotmg.c @@ -224,6 +224,7 @@ CTEST(drotmg, scaled_y_greater_than_scaled_x) } } +#ifndef NO_CBLAS /** * C API specific test * Test drotmg by comparing it against pre-calculated values @@ -411,4 +412,5 @@ CTEST(drotmg, c_api_scaled_y_greater_than_scaled_x) ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dsum.c b/utest/test_extensions/test_dsum.c index e987c5a4..c7e6d956 100644 --- a/utest/test_extensions/test_dsum.c +++ b/utest/test_extensions/test_dsum.c @@ -62,6 +62,7 @@ CTEST(dsum, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; + x[0]=0.; for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } @@ -220,6 +221,7 @@ CTEST(dsum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dsum by comparing it against pre-calculated values @@ -243,6 +245,7 @@ CTEST(dsum, c_api_step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; + x[0]=0.; for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } @@ -400,4 +403,5 @@ CTEST(dsum, c_api_step_2_N_50){ double sum = cblas_dsum(N, x, inc); ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dzsum.c b/utest/test_extensions/test_dzsum.c index 5139f59c..318d7fbe 100644 --- a/utest/test_extensions/test_dzsum.c +++ b/utest/test_extensions/test_dzsum.c @@ -62,6 +62,7 @@ CTEST(dzsum, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; + x[0] = 0.0; for (i = 0; i < N * inc * 2; i ++) { x[i] = i + 1000; } @@ -220,6 +221,7 @@ CTEST(dzsum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test 
dzsum by comparing it against pre-calculated values @@ -243,6 +245,7 @@ CTEST(dzsum, c_api_step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; + x[0] = 0.0; for (i = 0; i < N * inc * 2; i ++) { x[i] = i + 1000; } @@ -400,4 +403,5 @@ CTEST(dzsum, c_api_step_2_N_50){ double sum = cblas_dzsum(N, x, inc); ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_icamin.c b/utest/test_extensions/test_icamin.c index cca464ea..8ac0844a 100644 --- a/utest/test_extensions/test_icamin.c +++ b/utest/test_extensions/test_icamin.c @@ -331,6 +331,7 @@ CTEST(icamin, min_idx_in_vec_tail){ ASSERT_EQUAL(N, index); } +#ifndef NO_CBLAS /** * C API specific test * Test icamin by comparing it against pre-calculated values @@ -622,4 +623,5 @@ CTEST(icamin, c_api_min_idx_in_vec_tail){ blasint index = cblas_icamin(N, x, inc); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c index bebe76db..4bee258a 100644 --- a/utest/test_extensions/test_idamin.c +++ b/utest/test_extensions/test_idamin.c @@ -413,6 +413,7 @@ CTEST(idamin, min_idx_in_vec_tail_inc_1){ ASSERT_EQUAL(N, index); } +#ifndef NO_CBLAS /** * C API specific test * Test idamin by comparing it against pre-calculated values @@ -787,3 +788,4 @@ CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ ASSERT_EQUAL(N - 1, index); } #endif +#endif diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index d93813e6..a4a41847 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -412,7 +412,7 @@ CTEST(isamin, min_idx_in_vec_tail_inc_1){ free(x); ASSERT_EQUAL(N, index); } - +#ifndef NO_CBLAS /** * C API specific test * Test isamin by comparing it against pre-calculated values @@ -787,3 +787,4 @@ CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ ASSERT_EQUAL(N - 1, 
index); } #endif +#endif diff --git a/utest/test_extensions/test_izamin.c b/utest/test_extensions/test_izamin.c index a0bdae8e..8c923c60 100644 --- a/utest/test_extensions/test_izamin.c +++ b/utest/test_extensions/test_izamin.c @@ -331,6 +331,7 @@ CTEST(izamin, min_idx_in_vec_tail){ ASSERT_EQUAL(N, index); } +#ifndef NO_CBLAS /** * C API specific test * Test izamin by comparing it against pre-calculated values @@ -623,3 +624,4 @@ CTEST(izamin, c_api_min_idx_in_vec_tail){ ASSERT_EQUAL(N - 1, index); } #endif +#endif diff --git a/utest/test_extensions/test_saxpby.c b/utest/test_extensions/test_saxpby.c index b4bd5cf0..44f89240 100644 --- a/utest/test_extensions/test_saxpby.c +++ b/utest/test_extensions/test_saxpby.c @@ -96,6 +96,7 @@ static float check_saxpby(blasint n, float alpha, blasint incx, float beta, blas return BLASFUNC(snrm2)(&n, data_saxpby.y_test, &incy_abs); } +#ifndef NO_CBLAS /** * C API specific function * Test saxpby by comparing it with sscal and saxpy. @@ -141,6 +142,7 @@ static float c_api_check_saxpby(blasint n, float alpha, blasint incx, float beta // Find the norm of differences return cblas_snrm2(n, data_saxpby.y_test, incy_abs); } +#endif /** * Fortran API specific test @@ -467,6 +469,7 @@ CTEST(saxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test saxpby by comparing it with sscal and saxpy. 
@@ -791,4 +794,5 @@ CTEST(saxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_scsum.c b/utest/test_extensions/test_scsum.c index 492e1a4c..71628084 100644 --- a/utest/test_extensions/test_scsum.c +++ b/utest/test_extensions/test_scsum.c @@ -62,6 +62,7 @@ CTEST(scsum, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; + x[0] = 0.0f; for (i = 0; i < N * inc * 2; i ++) { x[i] = i + 1000; } @@ -220,6 +221,7 @@ CTEST(scsum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test scsum by comparing it against pre-calculated values @@ -243,6 +245,7 @@ CTEST(scsum, c_api_step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; + x[0] = 0.0f; for (i = 0; i < N * inc * 2; i ++) { x[i] = i + 1000; } @@ -400,4 +403,5 @@ CTEST(scsum, c_api_step_2_N_50){ float sum = cblas_scsum(N, x, inc); ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c index 171132b9..009066a9 100644 --- a/utest/test_extensions/test_sgeadd.c +++ b/utest/test_extensions/test_sgeadd.c @@ -63,10 +63,10 @@ static void sgeadd_trusted(blasint m, blasint n, float alpha, float *aptr, blasint lda, float beta, float *cptr, blasint ldc) { blasint i; - + blasint one=1; for (i = 0; i < n; i++) { - cblas_saxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(saxpby)(&m, &alpha, aptr, &one, &beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -115,9 +115,11 @@ static float check_sgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, BLASFUNC(sgeadd) (&m, &n, &alpha, data_sgeadd.a_test, &lda, &beta, data_sgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, beta, data_sgeadd.c_test, ldc); +#endif // Find the differences between output matrix 
caculated by sgeadd and sgemm return smatrix_difference(data_sgeadd.c_test, data_sgeadd.c_verify, cols, rows, ldc); @@ -150,9 +152,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, BLASFUNC(sgeadd) (&m, &n, &alpha, data_sgeadd.a_test, &lda, &beta, data_sgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, beta, data_sgeadd.c_test, ldc); +#endif return check_error(); } @@ -420,6 +424,7 @@ CTEST(sgeadd, m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test sgeadd by comparing it against reference @@ -877,4 +882,5 @@ CTEST(sgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c index 5b51e357..177ce0d7 100644 --- a/utest/test_extensions/test_sgemmt.c +++ b/utest/test_extensions/test_sgemmt.c @@ -73,9 +73,11 @@ static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra if(api == 'F') BLASFUNC(sgemm)(&transa, &transb, &m, &m, &k, &alpha, data_sgemmt.a_test, &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc); +#endif if (uplo == 'L' || uplo == CblasLower) { @@ -152,9 +154,11 @@ static float check_sgemmt(char api, enum CBLAS_ORDER order, char uplo, char tran if (api == 'F') BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc; i++) data_sgemmt.c_verify[i] -= data_sgemmt.c_test[i]; @@ -189,9 +193,11 @@ static int 
check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc); +#endif return check_error(); } @@ -480,6 +486,7 @@ CTEST(sgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test sgemmt by comparing it against sgemm @@ -1023,6 +1030,7 @@ CTEST(sgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Fortran API specific test @@ -1168,6 +1176,7 @@ CTEST(sgemmt, xerbla_ldc_invalid) ASSERT_EQUAL(TRUE, passed); } +#ifndef NO_CBLAS /** * C API specific test. * Test error function for an invalid param order. @@ -1439,4 +1448,5 @@ CTEST(sgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c index ba388596..c00ea0c8 100644 --- a/utest/test_extensions/test_simatcopy.c +++ b/utest/test_extensions/test_simatcopy.c @@ -93,6 +93,7 @@ static float check_simatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -103,6 +104,7 @@ static float check_simatcopy(char api, char order, char trans, blasint rows, bla cblas_simatcopy(corder, ctrans, rows, cols, alpha, data_simatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by simatcopy and reference func return smatrix_difference(data_simatcopy.a_test, 
data_simatcopy.a_verify, cols_out, rows_out, lda_dst); @@ -687,6 +689,7 @@ CTEST(simatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test simatcopy by comparing it against reference @@ -778,6 +781,7 @@ CTEST(simatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. @@ -912,4 +916,4 @@ CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c index b53c7cae..62a6056d 100644 --- a/utest/test_extensions/test_somatcopy.c +++ b/utest/test_extensions/test_somatcopy.c @@ -94,6 +94,7 @@ static float check_somatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, &lda, data_somatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -104,7 +105,8 @@ static float check_somatcopy(char api, char order, char trans, blasint rows, bla cblas_somatcopy(corder, ctrans, rows, cols, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_test, ldb); } - +#endif + return smatrix_difference(data_somatcopy.b_test, data_somatcopy.b_verify, b_cols, b_rows, ldb); } @@ -412,6 +414,7 @@ CTEST(somatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test somatcopy by comparing it against refernce @@ -503,6 +506,7 @@ CTEST(somatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_srotmg.c b/utest/test_extensions/test_srotmg.c index 3c97e3b4..f0422d2b 100644 --- a/utest/test_extensions/test_srotmg.c +++ b/utest/test_extensions/test_srotmg.c @@ -224,6 +224,7 @@ CTEST(srotmg, scaled_y_greater_than_scaled_x) } } +#ifndef NO_CBLAS /** * C API specific test * Test srotmg by comparing it against pre-calculated values @@ -411,4 +412,5 @@ CTEST(srotmg, c_api_scaled_y_greater_than_scaled_x) ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); } } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_ssum.c b/utest/test_extensions/test_ssum.c index 971a0d2e..5b20c0b0 100644 --- a/utest/test_extensions/test_ssum.c +++ b/utest/test_extensions/test_ssum.c @@ -62,6 +62,7 @@ CTEST(ssum, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; + x[0] = 0.0f; for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } @@ -220,6 +221,7 @@ CTEST(ssum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test ssum by comparing it against pre-calculated values @@ -243,6 +245,7 @@ CTEST(ssum, c_api_step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; + x[0] = 0.0f; for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } @@ -400,4 +403,5 @@ CTEST(ssum, c_api_step_2_N_50){ float sum = cblas_ssum(N, x, inc); ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zaxpby.c b/utest/test_extensions/test_zaxpby.c index 6148f44c..d6ca9994 100644 --- a/utest/test_extensions/test_zaxpby.c +++ b/utest/test_extensions/test_zaxpby.c @@ -96,6 +96,7 @@ static double check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, return BLASFUNC(dznrm2)(&n, data_zaxpby.y_test, &incy_abs); } +#ifndef NO_CBLAS /** * C API specific function * Test zaxpby by comparing it with zscal and zaxpy. 
@@ -145,6 +146,7 @@ static double c_api_check_zaxpby(blasint n, double *alpha, blasint incx, double // Find the norm of differences return cblas_dznrm2(n, data_zaxpby.y_test, incy_abs); } +#endif /** * Fortran API specific test @@ -387,6 +389,7 @@ CTEST(zaxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zaxpby by comparing it with zscal and zaxpy. @@ -628,3 +631,4 @@ CTEST(zaxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c index 7496ccf8..466b94a5 100644 --- a/utest/test_extensions/test_zgeadd.c +++ b/utest/test_extensions/test_zgeadd.c @@ -62,13 +62,14 @@ static void zgeadd_trusted(blasint m, blasint n, double *alpha, double *aptr, blasint lda, double *beta, double *cptr, blasint ldc) { blasint i; + blasint one=1; lda *= 2; ldc *= 2; for (i = 0; i < n; i++) { - cblas_zaxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(zaxpby)(&m, alpha, aptr, &one, beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -116,9 +117,11 @@ static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, beta, data_zgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, beta, data_zgeadd.c_test, ldc); +#endif // Find the differences between output matrix caculated by zgeadd and sgemm return dmatrix_difference(data_zgeadd.c_test, data_zgeadd.c_verify, cols, rows, ldc * 2); @@ -150,9 +153,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, beta, data_zgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, beta, data_zgeadd.c_test, ldc); +#endif return check_error(); } @@ -420,6 +425,7 @@ CTEST(zgeadd, m_zero) 
ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zgeadd by comparing it against reference @@ -877,4 +883,5 @@ CTEST(zgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zgemm.c b/utest/test_extensions/test_zgemm.c index 4160a508..bd23ebca 100644 --- a/utest/test_extensions/test_zgemm.c +++ b/utest/test_extensions/test_zgemm.c @@ -73,9 +73,10 @@ static double check_zgemm(char transa, char transb, blasint m, blasint n, blasin double alpha_conj[] = {1.0, 0.0}; char transa_verify = transa; char transb_verify = transb; + char cc[2]="C", cr[2]="R"; - int arows = k, acols = m; - int brows = n, bcols = k; + blasint arows = k, acols = m; + blasint brows = n, bcols = k; if (transa == 'T' || transa == 'C'){ arows = m; acols = k; @@ -99,12 +100,12 @@ static double check_zgemm(char transa, char transb, blasint m, blasint n, blasin data_zgemm.c_verify[i] = data_zgemm.c_test[i]; if (transa == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_zgemm.a_verify, lda, lda); + BLASFUNC(zimatcopy)(cc, cr, &arows, &acols, alpha_conj, data_zgemm.a_verify, &lda, &lda); transa_verify = 'N'; } if (transb == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_zgemm.b_verify, ldb, ldb); + BLASFUNC(zimatcopy)(cc, cr, &brows, &bcols, alpha_conj, data_zgemm.b_verify, &ldb, &ldb); transb_verify = 'N'; } @@ -270,4 +271,4 @@ CTEST(zgemm, transa_conjnotransb) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c index c5538100..34b8b618 100644 --- a/utest/test_extensions/test_zgemmt.c +++ b/utest/test_extensions/test_zgemmt.c @@ -73,9 +73,11 @@ static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra if(api == 'F') 
BLASFUNC(zgemm)(&transa, &transb, &m, &m, &k, alpha, data_zgemmt.a_test, &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_zgemm(order, transa, transb, m, m, k, alpha, data_zgemmt.a_test, lda, data_zgemmt.b_test, ldb, beta, data_zgemmt.c_gemm, ldc); +#endif ldc *= 2; @@ -160,9 +162,11 @@ static double check_zgemmt(char api, enum CBLAS_ORDER order, char uplo, char tra if (api == 'F') BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc * 2; i++) data_zgemmt.c_verify[i] -= data_zgemmt.c_test[i]; @@ -197,9 +201,11 @@ static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); +#endif return check_error(); } @@ -680,6 +686,7 @@ CTEST(zgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zgemmt by comparing it against sgemm @@ -1591,6 +1598,7 @@ CTEST(zgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Fortran API specific test @@ -1735,7 +1743,7 @@ CTEST(zgemmt, xerbla_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } - +#ifndef NO_CBLAS /** * C API specific test. * Test error function for an invalid param order. 
@@ -2007,4 +2015,5 @@ CTEST(zgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c index b2d0b271..4e419ad1 100644 --- a/utest/test_extensions/test_zgemv_t.c +++ b/utest/test_extensions/test_zgemv_t.c @@ -65,6 +65,7 @@ static struct DATA_ZGEMV_T data_zgemv_t; static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) { blasint i; + blasint one=1; double *a_ptr = data_zgemv_t.a_verify; double *x_ptr = data_zgemv_t.x_test; double *x_res = data_zgemv_t.x_verify; @@ -73,8 +74,12 @@ static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc for (i = 0; i < n * inc_x; i += inc_x) { - result = cblas_zdotu(lda, a_ptr, 1, x_ptr, inc_x); - x_res[0] = CREAL(result); +#ifdef RETURN_BY_STACK + BLASFUNC(zdotu)(&result, &lda, a_ptr, &one, x_ptr, &inc_x); +#else + result = BLASFUNC(zdotu)(&lda, a_ptr, &one, x_ptr, &inc_x); +#endif + x_res[0] = CREAL(result); x_res[1] = CIMAG(result); a_ptr += lda * 2; x_res += 2 * inc_x; @@ -157,6 +162,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_t.a_test, &lda, data_zgemv_t.x_test, &inc_x, beta, data_zgemv_t.y_test, &inc_y); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -177,13 +183,14 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n cblas_zgemv(corder, ctrans, m, n, alpha, data_zgemv_t.a_test, lda, data_zgemv_t.x_test, inc_x, beta, data_zgemv_t.y_test, inc_y); } +#endif // Find the differences between output vector caculated by zgemv and reference funcs for (i = 0; i < m * inc_y * 2; i++) data_zgemv_t.y_test[i] -= data_zgemv_t.y_verify[i]; // Find the norm of differences - return cblas_dznrm2(m, data_zgemv_t.y_test, 
inc_y); + return BLASFUNC(dznrm2)(&m, data_zgemv_t.y_test, &inc_y); } /** @@ -217,7 +224,7 @@ static int check_badargs(char order, char trans, blasint m, blasint n, return check_error(); } - +#ifndef NO_CBLAS /** * C API specific function * Check if error function was called with expected function name @@ -1134,3 +1141,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major) ASSERT_EQUAL(TRUE, passed); } #endif +#endif diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c index 8376bc49..86bc4670 100644 --- a/utest/test_extensions/test_zimatcopy.c +++ b/utest/test_extensions/test_zimatcopy.c @@ -98,6 +98,7 @@ static double check_zimatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -108,6 +109,7 @@ static double check_zimatcopy(char api, char order, char trans, blasint rows, bl cblas_zimatcopy(corder, ctrans, rows, cols, alpha, data_zimatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by zimatcopy and reference func return dmatrix_difference(data_zimatcopy.a_test, data_zimatcopy.a_verify, cols_out, rows_out, lda_dst*2); @@ -502,6 +504,7 @@ CTEST(zimatcopy, rowmajor_conjtrans_col_50_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zimatcopy by comparing it against reference @@ -681,6 +684,7 @@ CTEST(zimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
@@ -815,4 +819,4 @@ CTEST(zimatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c index 495831c5..208cfd98 100644 --- a/utest/test_extensions/test_zomatcopy.c +++ b/utest/test_extensions/test_zomatcopy.c @@ -99,6 +99,7 @@ static double check_zomatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, &lda, data_zomatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -109,7 +110,8 @@ static double check_zomatcopy(char api, char order, char trans, blasint rows, bl cblas_zomatcopy(corder, ctrans, rows, cols, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_test, ldb); } - +#endif + return dmatrix_difference(data_zomatcopy.b_test, data_zomatcopy.b_verify, b_cols, b_rows, ldb*2); } @@ -325,6 +327,7 @@ CTEST(zomatcopy, rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zomatcopy by comparing it against refernce @@ -508,6 +511,7 @@ CTEST(zomatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_zrot.c b/utest/test_extensions/test_zrot.c index 5471e051..c5ae22fc 100644 --- a/utest/test_extensions/test_zrot.c +++ b/utest/test_extensions/test_zrot.c @@ -105,6 +105,7 @@ static double check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, do return (norm / 2); } +#ifndef NO_CBLAS /** * C API specific function * Comapare results computed by zdrot and zaxpby @@ -787,4 +788,5 @@ CTEST(zrot, c_api_check_n_zero) double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c index 1de95447..c834bed6 100644 --- a/utest/test_extensions/test_zrotg.c +++ b/utest/test_extensions/test_zrotg.c @@ -162,6 +162,7 @@ CTEST(zrotg, negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zrotg by comparing it against pre-calculated values @@ -287,4 +288,5 @@ CTEST(zrotg, c_api_negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zscal.c b/utest/test_extensions/test_zscal.c index 132f4ee5..63cf355a 100644 --- a/utest/test_extensions/test_zscal.c +++ b/utest/test_extensions/test_zscal.c @@ -92,8 +92,10 @@ static double check_zscal(char api, blasint n, double *alpha, blasint inc) if(api == 'F') BLASFUNC(zscal)(&n, alpha, data_zscal.x_test, &inc); +#ifndef NO_CBLAS else cblas_zscal(n, alpha, data_zscal.x_test, inc); +#endif // Find the differences between output vector computed by zscal and zscal_trusted for (i = 0; i < n * 2 * inc; i++) @@ -133,6 +135,7 @@ CTEST(zscal, alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test 
zscal by comparing it against reference @@ -162,4 +165,5 @@ CTEST(zscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c index 5668ec29..5819877b 100644 --- a/utest/test_extensions/test_ztrmv.c +++ b/utest/test_extensions/test_ztrmv.c @@ -65,7 +65,7 @@ static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint l blasint i; double alpha_conj[] = {1.0, 0.0}; char trans_verify = trans; - + char cc[2]="C", cr[2]="R"; drand_generate(data_ztrmv.a_test, n * lda * 2); drand_generate(data_ztrmv.x_test, n * incx * 2); @@ -76,7 +76,7 @@ static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint l data_ztrmv.x_verify[i] = data_ztrmv.x_test[i]; if (trans == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ztrmv.a_verify, lda, lda); + BLASFUNC(zimatcopy)(cc, cr, &n, &n, alpha_conj, data_ztrmv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -263,4 +263,4 @@ CTEST(ztrmv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c index 4b7ec6aa..5db7963e 100644 --- a/utest/test_extensions/test_ztrsv.c +++ b/utest/test_extensions/test_ztrsv.c @@ -65,6 +65,7 @@ static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint l blasint i; double alpha_conj[] = {1.0, 0.0}; char trans_verify = trans; + char cc[2]="C", cr[2]="R"; drand_generate(data_ztrsv.a_test, n * lda * 2); drand_generate(data_ztrsv.x_test, n * incx * 2); @@ -76,8 +77,8 @@ static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint l data_ztrsv.x_verify[i] = data_ztrsv.x_test[i]; if (trans == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, - alpha_conj, data_ztrsv.a_verify, 
lda, lda); + BLASFUNC(zimatcopy)(cc, cr, &n, &n, + alpha_conj, data_ztrsv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -264,4 +265,4 @@ CTEST(ztrsv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/utest_main2.c b/utest/test_extensions/utest_main2.c new file mode 100644 index 00000000..41269f0e --- /dev/null +++ b/utest/test_extensions/utest_main2.c @@ -0,0 +1,712 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include +#include + + +#define CTEST_MAIN +#define CTEST_SEGFAULT +#define CTEST_ADD_TESTS_MANUALLY + +#include "cblas.h" +#include "utest/openblas_utest.h" +#if 1 +CTEST(amax, samax){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, 2.2, -3.3}; + fprintf(stderr,"testing samax\n"); + te_max=BLASFUNC(samax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + +CTEST(amax, damax){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={-1.1, 2.2, -3.3}; + + fprintf(stderr,"testing damax\n"); + te_max=BLASFUNC(damax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} + +CTEST (drotmg,rotmg) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i=0; + // original test case for libGoto bug fixed by feb2014 rewrite + te_d1= 0.21149573940783739; + te_d2= 0.046892057172954082; + te_x1= -0.42272687517106533; + te_y1= 0.42211309121921659; + + + for(i=0; i<5; i++){ + te_param[i]=tr_param[i]=0.0; + } + + //reference values as calculated by netlib blas + + tr_d1= 0.1732048; + tr_d2= 0.03840234; + tr_x1= -0.516180; + tr_y1= 0.422113; + tr_d1= 0.17320483687975; + tr_d2= 0.03840233915037; + tr_x1= -0.51618034832329; + tr_y1= 0.42211309121922; + + 
tr_param[0]= 0.0; + tr_param[1]= 0.0; + tr_param[2]= 0.99854803659786; + tr_param[3]= -0.22139439665872; + tr_param[4]= 0.0; + + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +CTEST (drotmg,rotmg_issue1452) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i=0; + + // from issue #1452 + te_d1 = 5.9e-8; + te_d2 = 5.960464e-8; + te_x1 = 1.0; + te_y1 = 150.0; + + for(i=0; i<5; i++){ + te_param[i]=tr_param[i]=0.0; + } + te_param[3]=1./4096.; + //reference values as calculated by gonum blas with rotmg rewritten to Hopkins' algorithm + tr_d1= 0.99995592822897; + tr_d2= 0.98981219860583; + tr_x1= 0.03662270484346; + tr_y1= 150.000000000000; + + tr_param[0]= -1.0; + tr_param[1]= 0.00000161109346; + tr_param[2]= -0.00024414062500; + tr_param[3]= 0.00024414062500; + tr_param[4]= 0.00000162760417; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } + +} + +CTEST(drotmg, rotmg_D1eqD2_X1eqX2) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i=0; + te_d1= tr_d1=2.; + te_d2= tr_d2=2.; + te_x1= tr_x1=8.; + te_y1= tr_y1=8.; + + for(i=0; i<5; i++){ + te_param[i]=tr_param[i]=0.0; + } + + //reference values as calculated by netlib blas + tr_d1= 1.0; + tr_d2= 1.0; + tr_x1= 16.0; + tr_y1= 8.0; + 
+ tr_param[0]=1.0; + tr_param[1]=1.0; + tr_param[2]=0.0; + tr_param[3]=0.0; + tr_param[4]=1.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +CTEST(drotmg, drotmg_D1_big_D2_big_flag_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]={1.,4096.,-4096.,1.,4096.}; + double tr_param[5]={-1.,4096.,-3584.,1792.,4096.}; + int i=0; + te_d1= tr_d1=1600000000.; + te_d2= tr_d2=800000000.; + te_x1= tr_x1=8.; + te_y1= tr_y1=7.; + + + //reference values as calculated by gonum + tr_d1= 68.96627824858757; + tr_d2= 34.483139124293785; + tr_x1= 45312.; + tr_y1= 7.0; + + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +CTEST(axpy,daxpy_inc_0) +{ + blasint i; + blasint N=8,incX=0,incY=0; + double a=0.25; + double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={4.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + + //OpenBLAS + BLASFUNC(daxpy)(&N,&a,x1,&incX,y1,&incY); + + for(i=0; i #include +#include #include #include "openblas_utest.h" @@ -41,7 +42,7 @@ static void* xmalloc(size_t n) void* tmp; tmp = malloc(n); if (tmp == NULL) { - fprintf(stderr, "You are about to die\n"); + fprintf(stderr, "Failed to allocate memory for the testcase.\n"); exit(1); } else { return tmp; @@ -103,6 +104,7 @@ 
exit(0); fork_pid = fork(); if (fork_pid == -1) { + perror("fork"); CTEST_ERR("Failed to fork process."); } else if (fork_pid == 0) { // Compute a DGEMM product in the child process to check that the @@ -113,7 +115,8 @@ exit(0); // recursively fork_pid_nested = fork(); if (fork_pid_nested == -1) { - CTEST_ERR("Failed to fork process."); + perror("fork"); + CTEST_ERR("Failed to fork nested process."); exit(1); } else if (fork_pid_nested == 0) { check_dgemm(a, b, d, c, n); diff --git a/utest/test_gemv.c b/utest/test_gemv.c new file mode 100644 index 00000000..dab6d2f1 --- /dev/null +++ b/utest/test_gemv.c @@ -0,0 +1,130 @@ +#include "openblas_utest.h" +#include + +#ifndef NAN +#define NAN 0.0/0.0 +#endif +#ifndef INFINITY +#define INFINITY 1.0/0.0 +#endif + +#ifdef BUILD_SINGLE + +CTEST(sgemv, 0_nan_inf) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + float alpha = 0.0; + float beta = 0.0; + char trans = 'N'; + float A[17 * 17]; + float X[17]; + float Y[17]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (N - 1); i += 2) + { + Y[i] = NAN; + Y[i + 1] = INFINITY; + } + Y[N - 1] = NAN; + BLASFUNC(sgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); + for (i = 0; i < N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(sgemv, 0_nan_inf_incy_2) +{ + int i; + blasint N = 17; + blasint Ny = 33; + blasint incX = 1; + blasint incY = 2; + float alpha = 0.0; + float beta = 0.0; + char trans = 'N'; + float A[17 * 17]; + float X[17]; + float Y[33]; + float *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (N - 1); i += 2) + { + ay[0] = NAN; + ay += 2; + ay[0] = INFINITY; + ay += 2; + } + Y[Ny - 1] = NAN; + BLASFUNC(sgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); + for (i = 0; i < Ny; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +#endif + +#ifdef BUILD_DOUBLE +CTEST(dgemv, 0_nan_inf) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 
1; + double alpha = 0.0; + double beta = 0.0; + char trans = 'N'; + double A[17 * 17]; + double X[17]; + double Y[17]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (N - 1); i += 2) + { + Y[i] = NAN; + Y[i + 1] = INFINITY; + } + Y[N - 1] = NAN; + BLASFUNC(dgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); + for (i = 0; i < N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(dgemv, 0_nan_inf_incy_2) +{ + int i; + blasint N = 17; + blasint Ny = 33; + blasint incX = 1; + blasint incY = 2; + double alpha = 0.0; + double beta = 0.0; + char trans = 'N'; + double A[17 * 17]; + double X[17]; + double Y[33]; + double *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (N - 1); i += 2) + { + ay[0] = NAN; + ay += 2; + ay[0] = INFINITY; + ay += 2; + } + Y[Ny - 1] = NAN; + BLASFUNC(dgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); + for (i = 0; i < Ny; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +#endif diff --git a/utest/test_post_fork.c b/utest/test_post_fork.c index 6d640aeb..d6e87f2b 100644 --- a/utest/test_post_fork.c +++ b/utest/test_post_fork.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #ifdef USE_OPENMP #include @@ -44,7 +45,7 @@ static void* xmalloc(size_t n) void* tmp; tmp = malloc(n); if (tmp == NULL) { - fprintf(stderr, "You are about to die\n"); + fprintf(stderr, "Failed to allocate memory for the test payload.\n"); exit(1); } else { return tmp; @@ -114,7 +115,11 @@ exit(0); fork_pid = fork(); if (fork_pid == -1) { - CTEST_ERR("Failed to fork process."); + perror("fork"); + CTEST_ERR("Failed to fork subprocesses in a loop."); +#ifdef USE_OPENMP + CTEST_ERR("Number of OpenMP threads was %d in this attempt.",i); +#endif } else if (fork_pid == 0) { // Just pretend to do something, e.g. 
call `uname`, then exit exit(0); diff --git a/utest/test_potrs.c b/utest/test_potrs.c index f39287d6..642ce1e3 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" - +#pragma GCC optimize("no-gcse") /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, diff --git a/utest/test_zscal.c b/utest/test_zscal.c index ffc851e8..09e63752 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -1,5 +1,449 @@ #include "openblas_utest.h" #include +#ifdef BUILD_SINGLE + +#ifndef NAN +#define NAN 0.0/0.0 +#endif +#ifndef INFINITY +#define INFINITY 1.0/0.0 +#endif + +CTEST(sscal, 0_nan) +{ + blasint N=9; + blasint incX=1; + float i = 0.0; + float x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, 0_nan_inc_2) +{ + blasint N=9; + blasint incX=2; + float i = 0.0; + float x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, nan_0) +{ + blasint N=9; + blasint incX=1; + float i = NAN; + float x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, nan_0_inc_2) +{ + blasint N=9; + blasint incX=2; + float i = NAN; + float x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, 0_inf) +{ + blasint N=9; + blasint incX=1; + float i = 
0.0; + float x[] = {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, 0_inf_inc_2) +{ + blasint N=9; + blasint incX=2; + float i = 0.0; + float x[] = {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, + INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, inf_0) +{ + blasint N=9; + blasint incX=1; + float i = INFINITY; + float x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, inf_0_inc_2) +{ + blasint N=9; + blasint incX=2; + float i = INFINITY; + float x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, nan_inf) +{ + blasint N=9; + blasint incX=1; + float i = NAN; + float x[] = {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, nan_inf_inc_2) +{ + blasint N=9; + blasint incX=2; + float i = NAN; + float x[] = {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, + INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, inf_nan) +{ + blasint N=9; + blasint incX=1; + float i = INFINITY; + float x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + 
BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(sscal, inf_nan_inc_2) +{ + blasint N=9; + blasint incX=2; + float i = INFINITY; + float x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + BLASFUNC(sscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +#endif + +#ifdef BUILD_DOUBLE + +#ifndef NAN +#define NAN 0.0/0.0 +#endif +#ifndef INFINITY +#define INFINITY 1.0/0.0 +#endif + +CTEST(dscal, 0_nan) +{ + blasint N=9; + blasint incX=1; + double i = 0.0; + double x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, 0_nan_inc_2) +{ + blasint N=9; + blasint incX=2; + double i = 0.0; + double x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, nan_0) +{ + blasint N=9; + blasint incX=1; + double i = NAN; + double x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, nan_0_inc_2) +{ + blasint N=9; + blasint incX=2; + double i = NAN; + double x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, 0_inf) +{ + blasint N=9; + blasint incX=1; + double i = 0.0; + double x[] = {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, 0_inf_inc_2) +{ + blasint N=9; + blasint incX=2; + double i = 0.0; + double x[] = 
{INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, + INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, inf_0) +{ + blasint N=9; + blasint incX=1; + double i = INFINITY; + double x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, inf_0_inc_2) +{ + blasint N=9; + blasint incX=2; + double i = INFINITY; + double x[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, nan_inf) +{ + blasint N=9; + blasint incX=1; + double i = NAN; + double x[] = {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, nan_inf_inc_2) +{ + blasint N=9; + blasint incX=2; + double i = NAN; + double x[] = {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, + INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, inf_nan) +{ + blasint N=9; + blasint incX=1; + double i = INFINITY; + double x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +CTEST(dscal, inf_nan_inc_2) +{ + blasint N=9; + blasint incX=2; + double i = INFINITY; + double x[] = {NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN}; + 
BLASFUNC(dscal)(&N, &i, x, &incX); + ASSERT_TRUE(isnan(x[0])); + ASSERT_TRUE(isnan(x[8])); +} + +#endif + +#ifdef BUILD_COMPLEX + +#ifndef NAN +#define NAN 0.0/0.0 +#endif +#ifndef INFINITY +#define INFINITY 1.0/0.0 +#endif + +CTEST(cscal, i_nan) +{ + blasint N=9; + blasint incX=1; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(cscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(cscal, i_nan_inc_2) +{ + blasint N=9; + blasint incX=2; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, + NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(cscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(cscal, nan_i) +{ + blasint N=9; + blasint incX=1; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(cscal)(&N, nan, i, &incX); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + +CTEST(cscal, nan_i_inc_2) +{ + blasint N=9; + blasint incX=2; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(cscal)(&N, nan, i, &incX); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + +CTEST(cscal, i_inf) +{ + blasint N=9; + blasint incX=1; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float inf[] = {INFINITY, 0, INFINITY,0, 
INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + BLASFUNC(cscal)(&N, i, inf, &incX); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(cscal, i_inf_inc_2) +{ + blasint N=9; + blasint incX=2; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, + INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + BLASFUNC(cscal)(&N, i, inf, &incX); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(cscal, inf_i) +{ + blasint N=9; + blasint incX=1; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + BLASFUNC(cscal)(&N, inf, i, &incX); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + +CTEST(cscal, inf_i_inc_2) +{ + blasint N=9; + blasint incX=2; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + BLASFUNC(cscal)(&N, inf, i, &incX); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + +CTEST(cscal, i_0inf) +{ + blasint N=9; + blasint incX=1; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float inf[] = {0,INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY}; + BLASFUNC(cscal)(&N, i, inf, &incX); + ASSERT_TRUE(isinf(inf[0])); + ASSERT_TRUE(isnan(inf[1])); + 
ASSERT_TRUE(isinf(inf[16])); + ASSERT_TRUE(isnan(inf[17])); +} + +CTEST(cscal, i_0inf_inc_2) +{ + blasint N=9; + blasint incX=2; + float i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + float inf[] = {0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, + 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY}; + BLASFUNC(cscal)(&N, i, inf, &incX); + ASSERT_TRUE(isinf(inf[0])); + ASSERT_TRUE(isnan(inf[1])); + ASSERT_TRUE(isinf(inf[16])); + ASSERT_TRUE(isnan(inf[17])); +} + +#endif + #ifdef BUILD_COMPLEX16 #ifndef NAN @@ -11,9 +455,11 @@ CTEST(zscal, i_nan) { + blasint N=9; + blasint incX=1; double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; - cblas_zscal(9, i, &nan, 1); + BLASFUNC(zscal)(&N, i, nan, &incX); ASSERT_TRUE(isnan(nan[0])); ASSERT_TRUE(isnan(nan[1])); ASSERT_TRUE(isnan(nan[16])); @@ -22,10 +468,12 @@ CTEST(zscal, i_nan) CTEST(zscal, i_nan_inc_2) { + blasint N=9; + blasint incX=2; double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; - cblas_zscal(9, i, &nan, 2); + BLASFUNC(zscal)(&N, i, nan, &incX); ASSERT_TRUE(isnan(nan[0])); ASSERT_TRUE(isnan(nan[1])); ASSERT_TRUE(isnan(nan[16])); @@ -34,9 +482,11 @@ CTEST(zscal, i_nan_inc_2) CTEST(zscal, nan_i) { + blasint N=9; + blasint incX=1; double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; - cblas_zscal(9, &nan, &i, 1); + BLASFUNC(zscal)(&N, nan, i, &incX); ASSERT_TRUE(isnan(i[0])); ASSERT_TRUE(isnan(i[1])); ASSERT_TRUE(isnan(i[16])); @@ -45,10 +495,12 @@ CTEST(zscal, nan_i) CTEST(zscal, nan_i_inc_2) { + blasint N=9; + blasint incX=2; double i[] = 
{0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; - cblas_zscal(9, &nan, &i, 2); + BLASFUNC(zscal)(&N, nan, i, &incX); ASSERT_TRUE(isnan(i[0])); ASSERT_TRUE(isnan(i[1])); ASSERT_TRUE(isnan(i[16])); @@ -57,9 +509,11 @@ CTEST(zscal, nan_i_inc_2) CTEST(zscal, i_inf) { + blasint N=9; + blasint incX=1; double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; - cblas_zscal(9, i, &inf, 1); + BLASFUNC(zscal)(&N, i, inf, &incX); ASSERT_TRUE(isnan(inf[0])); ASSERT_TRUE(isinf(inf[1])); ASSERT_TRUE(isnan(inf[16])); @@ -68,10 +522,12 @@ CTEST(zscal, i_inf) CTEST(zscal, i_inf_inc_2) { + blasint N=9; + blasint incX=2; double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; - cblas_zscal(9, i, &inf, 2); + BLASFUNC(zscal)(&N, i, inf, &incX); ASSERT_TRUE(isnan(inf[0])); ASSERT_TRUE(isinf(inf[1])); ASSERT_TRUE(isnan(inf[16])); @@ -80,9 +536,11 @@ CTEST(zscal, i_inf_inc_2) CTEST(zscal, inf_i) { + blasint N=9; + blasint incX=1; double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; - cblas_zscal(9, &inf, &i, 1); + BLASFUNC(zscal)(&N, inf, i, &incX); ASSERT_TRUE(isnan(i[0])); ASSERT_TRUE(isinf(i[1])); ASSERT_TRUE(isnan(i[16])); @@ -91,14 +549,43 @@ CTEST(zscal, inf_i) CTEST(zscal, inf_i_inc_2) { + blasint N=9; + blasint incX=2; double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, 
INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; - cblas_zscal(9, &inf, &i, 2); + BLASFUNC(zscal)(&N, inf, i, &incX); ASSERT_TRUE(isnan(i[0])); ASSERT_TRUE(isinf(i[1])); ASSERT_TRUE(isnan(i[16])); ASSERT_TRUE(isinf(i[17])); } +CTEST(zscal, i_0inf) +{ + blasint N=9; + blasint incX=1; + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {0,INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY}; + BLASFUNC(zscal)(&N, i, inf, &incX); + ASSERT_TRUE(isinf(inf[0])); + ASSERT_TRUE(isnan(inf[1])); + ASSERT_TRUE(isinf(inf[16])); + ASSERT_TRUE(isnan(inf[17])); +} + +CTEST(zscal, i_0inf_inc_2) +{ + blasint N=9; + blasint incX=2; + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, + 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY, 0,INFINITY}; + BLASFUNC(zscal)(&N, i, inf, &incX); + ASSERT_TRUE(isinf(inf[0])); + ASSERT_TRUE(isnan(inf[1])); + ASSERT_TRUE(isinf(inf[16])); + ASSERT_TRUE(isnan(inf[17])); +} + #endif